<?xml version="1.0" encoding="UTF-8"?><rss xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:googleplay="http://www.google.com/schemas/play-podcasts/1.0"><channel><title><![CDATA[Dennis's Rants]]></title><description><![CDATA[Educational content for game programmers]]></description><link>https://dennisrants.substack.com</link><image><url>https://substackcdn.com/image/fetch/$s_!nfdc!,w_256,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8007a87a-3e29-42a6-a5a0-9ca395bb3960_250x250.png</url><title>Dennis&apos;s Rants</title><link>https://dennisrants.substack.com</link></image><generator>Substack</generator><lastBuildDate>Sat, 11 Apr 2026 13:47:00 GMT</lastBuildDate><atom:link href="https://dennisrants.substack.com/feed" rel="self" type="application/rss+xml"/><copyright><![CDATA[Dennis Andersson]]></copyright><language><![CDATA[en]]></language><webMaster><![CDATA[dennisrants@substack.com]]></webMaster><itunes:owner><itunes:email><![CDATA[dennisrants@substack.com]]></itunes:email><itunes:name><![CDATA[Dennis Andersson]]></itunes:name></itunes:owner><itunes:author><![CDATA[Dennis Andersson]]></itunes:author><googleplay:owner><![CDATA[dennisrants@substack.com]]></googleplay:owner><googleplay:email><![CDATA[dennisrants@substack.com]]></googleplay:email><googleplay:author><![CDATA[Dennis Andersson]]></googleplay:author><itunes:block><![CDATA[Yes]]></itunes:block><item><title><![CDATA[Using DX12 in 2025]]></title><description><![CDATA[Writing a modern abstraction layer for D3D12.]]></description><link>https://dennisrants.substack.com/p/using-dx12-in-2025</link><guid isPermaLink="false">https://dennisrants.substack.com/p/using-dx12-in-2025</guid><dc:creator><![CDATA[Dennis Andersson]]></dc:creator><pubDate>Wed, 02 Apr 2025 14:33:46 
GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!m54c!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!m54c!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!m54c!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg 424w, https://substackcdn.com/image/fetch/$s_!m54c!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg 848w, https://substackcdn.com/image/fetch/$s_!m54c!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg 1272w, https://substackcdn.com/image/fetch/$s_!m54c!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!m54c!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg" width="724.953125" height="392.3510044642857" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/c4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:false,&quot;imageSize&quot;:&quot;normal&quot;,&quot;height&quot;:788,&quot;width&quot;:1456,&quot;resizeWidth&quot;:724.953125,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!m54c!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg 424w, https://substackcdn.com/image/fetch/$s_!m54c!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg 848w, https://substackcdn.com/image/fetch/$s_!m54c!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg 1272w, https://substackcdn.com/image/fetch/$s_!m54c!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c5eaf0-0917-4cf5-8ab9-b89a4710fbeb_1579x855.jpeg 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>We typically think of D3D12 as a modern graphics API, but it was released alongside Windows 10 in 2015, which already makes it a decade old. Even so, drivers are still being improved and Microsoft is actively supporting D3D12 with new updates as hardware evolves. Considering that we are not able to program GPUs directly but instead write code for a driver, D3D12 does many things better than some other APIs. </p><p>For D3D12 to have been released in 2015 it would&#8217;ve had to work on Nvidia Fermi and Kepler series GPUs released in 2010 and 2012. With less than 1% of users according to the February 2025 <a href="https://store.steampowered.com/hwsurvey/">Steam Hardware Survey</a>, it&#8217;s hard to argue for supporting GPUs prior to Nvidia Pascal. That would free us to design an API around modern features.</p><p>When designing my low-level graphics abstraction layer I imagined the ideal API would be something in-between D3D11 and D3D12. 
The layer has to be easy to use, save me time and not compromise on performance. If we were to just create a D3D11 style API then we would essentially be creating a driver, which Nvidia and AMD will definitely do better than us.</p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://dennisrants.substack.com/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Dennis's Rants is a reader-supported publication. Please consider becoming a free or paid subscriber.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><p></p><p>So in terms of features, what do I need from a Graphics API from a user perspective?</p><ul><li><p>Swap Chain Management and Presenting</p></li><li><p>Creating/Destroying Resources and Pipelines</p></li><li><p>Recording and Submitting Command Buffers</p></li><li><p>Uploading from CPU to GPU memory</p></li></ul><p>And this is more or less what I ended up with:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!MNgC!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" 
srcset="https://substackcdn.com/image/fetch/$s_!MNgC!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png 424w, https://substackcdn.com/image/fetch/$s_!MNgC!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png 848w, https://substackcdn.com/image/fetch/$s_!MNgC!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png 1272w, https://substackcdn.com/image/fetch/$s_!MNgC!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!MNgC!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png" width="906" height="553" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:553,&quot;width&quot;:906,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" 
srcset="https://substackcdn.com/image/fetch/$s_!MNgC!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png 424w, https://substackcdn.com/image/fetch/$s_!MNgC!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png 848w, https://substackcdn.com/image/fetch/$s_!MNgC!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png 1272w, https://substackcdn.com/image/fetch/$s_!MNgC!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72d29a85-a375-4cc6-a7eb-bc2344e2fbe1_906x553.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" 
stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>As an example, here is a simple use case for drawing:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!sJRo!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!sJRo!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png 424w, https://substackcdn.com/image/fetch/$s_!sJRo!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png 848w, https://substackcdn.com/image/fetch/$s_!sJRo!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png 1272w, https://substackcdn.com/image/fetch/$s_!sJRo!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!sJRo!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png" width="728" 
height="566.9172932330827" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:false,&quot;imageSize&quot;:&quot;normal&quot;,&quot;height&quot;:725,&quot;width&quot;:931,&quot;resizeWidth&quot;:728,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!sJRo!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png 424w, https://substackcdn.com/image/fetch/$s_!sJRo!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png 848w, https://substackcdn.com/image/fetch/$s_!sJRo!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png 1272w, https://substackcdn.com/image/fetch/$s_!sJRo!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cd41a15-def5-40c3-9a91-cedf076e8cf3_931x725.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><div><hr></div><h2>Pipeline State Objects</h2><p>The most important decision for me is to treat PSOs as first-class citizens in the API design. In D3D12 the PSOs are immutable so modifying a state means creating a whole new PSO. Creating a PSO will have the drivers compile the shaders which may take up to seconds and cause stutters (#StutterStruggle). </p><p>We saw many engines in the early days keep the old D3D11 style API when porting to D3D12. With an API like this: <code> Ctx-&gt;PSSetShader(), Ctx-&gt;OMSetDepthStencilState(),</code> <code>etc.</code> it&#8217;s very difficult to know what PSO combinations will be needed ahead of time which means the PSOs are created on demand. Along with the combinatorial explosion in the number of unique PSOs, this results in constant stuttering. 
Managing the PSOs in these engines is also complex, with hash-maps everywhere, which is slow.</p><div><hr></div><h2>Bindless</h2><p>I decided to go all-in on bindless resources which eliminates the need for complex descriptor management. Not only does it save a lot of headaches implementation-wise but calls to <code>CopyDescriptors</code> can be quite a CPU cost. With Shader Model 6.6 comes the ability to index directly into a resource heap with <code>ResourceDescriptorHeap</code> and <code>SamplerDescriptorHeap</code>.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!lsj5!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!lsj5!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png 424w, https://substackcdn.com/image/fetch/$s_!lsj5!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png 848w, https://substackcdn.com/image/fetch/$s_!lsj5!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png 1272w, https://substackcdn.com/image/fetch/$s_!lsj5!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!lsj5!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png" width="697" height="245" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/d4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:245,&quot;width&quot;:697,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!lsj5!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png 424w, https://substackcdn.com/image/fetch/$s_!lsj5!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png 848w, https://substackcdn.com/image/fetch/$s_!lsj5!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png 1272w, https://substackcdn.com/image/fetch/$s_!lsj5!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd4ffcf20-3301-4f96-9619-cba1cbe59ea0_697x245.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container 
restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>I utilize root constants heavily in order to push the various descriptor indices to the shaders.</p><p></p><h4>Bindless Input Layouts</h4><p>Bindless allows us to omit Vertex Input Layouts by leaving that blank in the pipeline state and not binding a vertex buffer during drawing. The advantage of this is that we require fewer PSO permutations for different vertex formats, which makes everything a lot more flexible, and we save one API call to the driver by not having to bind a vertex buffer.</p><p>The idea is to just reference a buffer type in the Vertex Shader and unpack vertex data manually with <code>SV_VertexID</code>. 
This also allows for some exotic vertex formats and quantization/compression.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!yTNT!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!yTNT!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png 424w, https://substackcdn.com/image/fetch/$s_!yTNT!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png 848w, https://substackcdn.com/image/fetch/$s_!yTNT!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png 1272w, https://substackcdn.com/image/fetch/$s_!yTNT!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!yTNT!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png" width="752" height="462" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/e3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:462,&quot;width&quot;:752,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!yTNT!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png 424w, https://substackcdn.com/image/fetch/$s_!yTNT!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png 848w, https://substackcdn.com/image/fetch/$s_!yTNT!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png 1272w, https://substackcdn.com/image/fetch/$s_!yTNT!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3de4f98-0c8e-4971-a3ee-76f3d81dbc42_752x462.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><div><hr></div><h2>Memory Management</h2><p>In terms of resource memory management I&#8217;m not exposing Resource Heaps directly into the API. Generally speaking there are three cases for resources that I&#8217;m interested in:</p><ul><li><p>Committed memory for large persistent resources such as Render Targets.</p></li><li><p>Temporary Linear memory for uploading from CPU to GPU.</p></li><li><p>Placed memory in GPU for all other semi-persistent resources (textures, buffers, etc. which might get streamed in/out).</p></li></ul><p></p><h4>Persistent Memory</h4><p>For placed resources I implemented a <a href="http://www.gii.upv.es/tlsf/">TLSF allocator</a> which I have seen in many graphics API memory managers. It has an average fragmentation rate of 15% with constant O(1) allocation time. I&#8217;ve measured allocation to be in the order of less than 80 cycles on my machine. 
Resource Heaps are created in blocks of 256MB from which I suballocate with TLSF.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!AH8-!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!AH8-!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png 424w, https://substackcdn.com/image/fetch/$s_!AH8-!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png 848w, https://substackcdn.com/image/fetch/$s_!AH8-!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png 1272w, https://substackcdn.com/image/fetch/$s_!AH8-!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!AH8-!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png" width="870" height="725" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:725,&quot;width&quot;:870,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!AH8-!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png 424w, https://substackcdn.com/image/fetch/$s_!AH8-!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png 848w, https://substackcdn.com/image/fetch/$s_!AH8-!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png 1272w, https://substackcdn.com/image/fetch/$s_!AH8-!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F951c0c66-96a7-4178-a654-21ac2e82245a_870x725.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>A <code>TlsfAlloc</code> contains an offset which allows us to use that for calls to <code>CreatePlacedResource</code>:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!-rfY!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!-rfY!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png 424w, https://substackcdn.com/image/fetch/$s_!-rfY!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png 848w, 
https://substackcdn.com/image/fetch/$s_!-rfY!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png 1272w, https://substackcdn.com/image/fetch/$s_!-rfY!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!-rfY!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png" width="862" height="291" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:291,&quot;width&quot;:862,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!-rfY!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png 424w, https://substackcdn.com/image/fetch/$s_!-rfY!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png 848w, 
https://substackcdn.com/image/fetch/$s_!-rfY!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png 1272w, https://substackcdn.com/image/fetch/$s_!-rfY!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17860404-7dbf-424f-bef5-60f8cef9d92d_862x291.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p></p><h4>Temporary Upload Memory</h4><p>When uploading resources I use a simple scratch allocator that is reset each frame. 
The idea is to have a large Buffer resource in upload heap that is always mapped, and when allocating scratch space we just bump the mapped pointer. This can then be copied to the persistent resource on the GPU. Creating a resource and uploading it to GPU memory looks like this for the user:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!F8Zj!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!F8Zj!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png 424w, https://substackcdn.com/image/fetch/$s_!F8Zj!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png 848w, https://substackcdn.com/image/fetch/$s_!F8Zj!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png 1272w, https://substackcdn.com/image/fetch/$s_!F8Zj!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!F8Zj!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png" width="821" height="368" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/b053ed38-8cb4-4465-a928-542daa684dd5_821x368.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:368,&quot;width&quot;:821,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!F8Zj!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png 424w, https://substackcdn.com/image/fetch/$s_!F8Zj!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png 848w, https://substackcdn.com/image/fetch/$s_!F8Zj!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png 1272w, https://substackcdn.com/image/fetch/$s_!F8Zj!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb053ed38-8cb4-4465-a928-542daa684dd5_821x368.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>This is much better than creating Placed Resources in an Upload Heap. Implementing this system is very easy and it&#8217;s as fast as you&#8217;ll get &#8212; not more than a dozen cycles as all we do is increment a pointer. 
Here is the complete implementation:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!znbC!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!znbC!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png 424w, https://substackcdn.com/image/fetch/$s_!znbC!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png 848w, https://substackcdn.com/image/fetch/$s_!znbC!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png 1272w, https://substackcdn.com/image/fetch/$s_!znbC!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!znbC!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png" width="856" height="1142" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/d0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1142,&quot;width&quot;:856,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!znbC!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png 424w, https://substackcdn.com/image/fetch/$s_!znbC!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png 848w, https://substackcdn.com/image/fetch/$s_!znbC!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png 1272w, https://substackcdn.com/image/fetch/$s_!znbC!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd0dae213-b4af-41b8-a836-211d7e8f1410_856x1142.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><div><hr></div><h2>Resource Table</h2><p>Creating various resources in D3D12 typically returns a pointer to <code>ID3D12Resource</code>. We could just expose this directly in the abstraction layer with an opaque void pointer type but I&#8217;m not a fan of passing around pointers everywhere. You typically also need to store auxiliary data alongside the actual resource pointer.</p><p>I&#8217;ve implemented a simple handle and table system for storing resources. The user of the API doesn&#8217;t really need to know the internals of the data so just exposing a <code>ResourceHandle</code> is enough. I don&#8217;t like randomly allocating tiny objects all over the place because of performance and it makes lifetime management difficult. By the way, this is why people use &#8216;smart pointers&#8217; everywhere. 
A much simpler solution is to just have a large blob of memory and use it as a pool or table.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!TcZZ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!TcZZ!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png 424w, https://substackcdn.com/image/fetch/$s_!TcZZ!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png 848w, https://substackcdn.com/image/fetch/$s_!TcZZ!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png 1272w, https://substackcdn.com/image/fetch/$s_!TcZZ!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!TcZZ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png" width="733" height="726" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/b311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:726,&quot;width&quot;:733,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:44845,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://dennisrants.substack.com/i/160084709?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!TcZZ!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png 424w, https://substackcdn.com/image/fetch/$s_!TcZZ!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png 848w, https://substackcdn.com/image/fetch/$s_!TcZZ!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png 1272w, https://substackcdn.com/image/fetch/$s_!TcZZ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb311bf26-1a8a-446a-8a2e-5807d06114dc_733x726.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" 
viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p></p><p><code>ResourceHandle</code> could&#8217;ve just been an index, however someone could keep that index around without knowing that the resource was destroyed. For that we need a Version or Generation counter next to the index. The table will also store an array of Version counters, one for each Handle/Index. The idea is simple, when we fetch a resource we compare the Version in the <code>ResourceHandle</code> with the version in the table. 
We can then tell if the reference is valid:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!S1o5!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!S1o5!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png 424w, https://substackcdn.com/image/fetch/$s_!S1o5!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png 848w, https://substackcdn.com/image/fetch/$s_!S1o5!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png 1272w, https://substackcdn.com/image/fetch/$s_!S1o5!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!S1o5!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png" width="847" height="1050" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/e37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1050,&quot;width&quot;:847,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!S1o5!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png 424w, https://substackcdn.com/image/fetch/$s_!S1o5!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png 848w, https://substackcdn.com/image/fetch/$s_!S1o5!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png 1272w, https://substackcdn.com/image/fetch/$s_!S1o5!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe37e3bd6-6ba9-435a-bb08-d69fc1fb88d9_847x1050.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>You could instead choose to store the Versions inside the actual <code>Resource</code> struct itself. I&#8217;m a fairly pragmatic programmer so that decision in my mind would depend on what the most common data access pattern is, and its effect on performance. The same goes for if you frequently need to iterate over resources of a common type &#8212; you could have multiple tables, one for each type.</p><div><hr></div><h2>Descriptor Management</h2><p>Using bindless significantly simplifies descriptor management, however we still have to allocate descriptor slots for resources and handle frame buffering. 
The <code>DescriptorAllocator</code> is a simple struct that just contains free slots which are claimed when allocating a View.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!f3rB!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!f3rB!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png 424w, https://substackcdn.com/image/fetch/$s_!f3rB!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png 848w, https://substackcdn.com/image/fetch/$s_!f3rB!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png 1272w, https://substackcdn.com/image/fetch/$s_!f3rB!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!f3rB!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png" width="839" height="764" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/9b663291-9577-47da-adea-c9d56505dd2c_839x764.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:764,&quot;width&quot;:839,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!f3rB!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png 424w, https://substackcdn.com/image/fetch/$s_!f3rB!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png 848w, https://substackcdn.com/image/fetch/$s_!f3rB!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png 1272w, https://substackcdn.com/image/fetch/$s_!f3rB!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9b663291-9577-47da-adea-c9d56505dd2c_839x764.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Frame buffering descriptors become a little bit more complicated. My approach is to have one <code>ID3D12DescriptorHeap</code> per frame and a staging <code>ID3D12DescriptorHeap</code> which is non-shader visible. When allocating/freeing a descriptor we do that in both the current frame Heap and the staging Heap, and at the end of the frame call <code>CopyDescriptors</code> to copy the Staging Heap to the Next Frame Heap. 
Unfortunately D3D12 requires the source operand of <code>CopyDescriptors</code> <a href="https://learn.microsoft.com/en-us/windows/win32/api/d3d12/nf-d3d12-id3d12device-copydescriptors">to be non-shader visible</a> which is why we need the staging heap and can&#8217;t just copy between the frames directly.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!jVRX!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!jVRX!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png 424w, https://substackcdn.com/image/fetch/$s_!jVRX!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png 848w, https://substackcdn.com/image/fetch/$s_!jVRX!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png 1272w, https://substackcdn.com/image/fetch/$s_!jVRX!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!jVRX!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png" width="897" height="1244" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1244,&quot;width&quot;:897,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!jVRX!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png 424w, https://substackcdn.com/image/fetch/$s_!jVRX!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png 848w, https://substackcdn.com/image/fetch/$s_!jVRX!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png 1272w, https://substackcdn.com/image/fetch/$s_!jVRX!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baca853-d5b1-4f36-9be6-0baa80797430_897x1244.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Overall my descriptor management code is fairly lightweight at around 250 LOC.</p><div><hr></div><h2>Barriers</h2><p>Unfortunately I don&#8217;t really have anything special for barriers, they are pretty much just mapped 1:1 with D3D12. I haven&#8217;t looked too much into the new Enhanced Barriers yet, but maybe those allow us to simplify things.</p><div><hr></div><h2>Conclusion</h2><p>I&#8217;m pretty happy with the state of my low-level API. It&#8217;s enabled me to do pretty much everything that I need of it. There have been some modifications recently to support <code>ExecuteIndirect</code>, but apart from that my API has remained the same for 1.5 years. Of course, if I were to ship something I suspect there might be something I&#8217;d have to reconsider in the backend implementation, but the front facing API seems to cover most use cases.</p><p>&gt; 90% of the renderer time is spent in driver code so I am happy about the performance vs. 
simplicity trade-offs, writing graphics code requires significantly less code than raw D3D12.</p><p>In the future I hope for a world where we could just program the GPUs directly and not have to rely on Graphics APIs and Drivers. </p><div><hr></div><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://dennisrants.substack.com/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Dennis's Rants is a reader-supported publication. To receive new posts and support my work, consider becoming a free or paid subscriber.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div>]]></content:encoded></item><item><title><![CDATA[How-To: SIMD Programming]]></title><description><![CDATA[An introduction to practical SIMD programming and common pitfalls]]></description><link>https://dennisrants.substack.com/p/how-to-simd-programming</link><guid isPermaLink="false">https://dennisrants.substack.com/p/how-to-simd-programming</guid><dc:creator><![CDATA[Dennis Andersson]]></dc:creator><pubDate>Fri, 27 Sep 2024 11:15:53 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!Indx!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg" length="0" type="image/jpeg"/><content:encoded><![CDATA[<p>If you&#8217;ve been programming for a while, especially at a low level, you have almost certainly heard of SIMD. 
<strong>Single instruction, multiple data</strong> (SIMD) is exactly what it sounds like &#8212; it allows you to process multiple pieces of data with a single instruction. I like SIMD because it can often lead to <strong>4x</strong>, <strong>8x</strong> or even <strong>16x</strong> performance speed ups when used correctly.</p><p>This post is mostly aimed at beginner&#8212;intermediate developers who haven&#8217;t programmed with SIMD a lot, but this might still serve as a good refresher for experienced programmers. This should be a good introduction to SIMD but don&#8217;t expect to become an expert after just reading this post ;)</p><p><em>Please also note that in this post we will only focus on x86/x64 SIMD. Other architectures are not covered. </em></p><div><hr></div><h5>In this post we will:</h5><ol><li><p>Understand the Basics of SIMD</p></li><li><p>Figure out how to Query SIMD support on PC</p></li><li><p>Know the different ways we can program SIMD</p><ol><li><p>Compare pros &amp; cons of each approach</p></li></ol></li><li><p>Convert an example to use SIMD &#8212; with benchmarks</p><ol><li><p>The typical Na&#239;ve approach</p></li><li><p>Doing it the right way with the correct mental model</p></li></ol></li><li><p>Cover some tips &amp; tricks to common problems when using SIMD</p></li><li><p>Notes on Architecting systems to be optimizable with SIMD</p></li></ol><div><hr></div><h2>What is SIMD?</h2><p>SIMD is an actual piece of silicon in the CPU. 
Here is a Die shot of AMD&#8217;s &#8220;Zen 1&#8221; core<a class="footnote-anchor" data-component-name="FootnoteAnchorToDOM" id="footnote-anchor-1" href="#footnote-1" target="_self">1</a> with the SIMD units highlighted in red:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Indx!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Indx!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg 424w, https://substackcdn.com/image/fetch/$s_!Indx!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg 848w, https://substackcdn.com/image/fetch/$s_!Indx!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg 1272w, https://substackcdn.com/image/fetch/$s_!Indx!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Indx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg" width="562" height="426.95631067961165" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:626,&quot;width&quot;:824,&quot;resizeWidth&quot;:562,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Indx!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg 424w, https://substackcdn.com/image/fetch/$s_!Indx!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg 848w, https://substackcdn.com/image/fetch/$s_!Indx!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg 1272w, https://substackcdn.com/image/fetch/$s_!Indx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6fa3e96f-015f-4e3c-a173-4c86bfec6c0e_824x626.jpeg 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Image courtesy of Fritzchens Fritz</figcaption></figure></div><p>You might notice that the SIMD region has this sort of grid-like pattern. I&#8217;m not a hardware person so I don&#8217;t know for sure why it looks like that, but when we dive into how SIMD works &#8212; I think it makes sense why the hardware might look like this.</p><p>A simple SIMD example would be the ability to multiply pairs of 4 different 32-bit values with a single MULTIPLY instruction, instead of needing 4 individual MULTIPLY instructions. The time it takes to complete a SIMD MULTIPLY is usually the same as a regular MULTIPLY, which means in our example we can get a 4x speedup using SIMD. </p><p>We are not just restricted to multiplication, there are a plethora of SIMD instruction types available: from common arithmetic operations such as Integer/Float <code>add</code>, <code>subtract</code> and <code>multiply</code>, to bit manipulation such as left/right shift and bitwise <code>or/and/xor</code>. 
We can also perform comparisons (equal to, greater than, less than, etc.), and perform memory operations such as <code>load/store/move</code>.</p><div><hr></div><h2>SIMD Registers &amp; Lanes</h2><p>When working with SIMD we use special registers which may be 128-bit, 256-bit or even 512-bit wide. These registers are then divided up into &#8220;<strong>lanes</strong>&#8221;. Each lane contains one piece of data that the SIMD instruction will operate on. As an example: a 128-bit register with 4 lanes gives us 4x 32-bit values:</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!-I3-!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!-I3-!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png 424w, https://substackcdn.com/image/fetch/$s_!-I3-!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png 848w, https://substackcdn.com/image/fetch/$s_!-I3-!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png 1272w, https://substackcdn.com/image/fetch/$s_!-I3-!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!-I3-!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png" width="414" height="228" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/a8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:228,&quot;width&quot;:414,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!-I3-!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png 424w, https://substackcdn.com/image/fetch/$s_!-I3-!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png 848w, https://substackcdn.com/image/fetch/$s_!-I3-!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png 1272w, https://substackcdn.com/image/fetch/$s_!-I3-!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8cee3da-57f9-4db2-9890-77a77bbc891e_414x228.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><p>As I just mentioned SIMD carries out its operations on a lane by lane basis. 
This means if we issue a SIMD ADD instruction between two registers, it will perform the add for each lane &#8220;vertically&#8221;:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!XEdo!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!XEdo!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png 424w, https://substackcdn.com/image/fetch/$s_!XEdo!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png 848w, https://substackcdn.com/image/fetch/$s_!XEdo!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png 1272w, https://substackcdn.com/image/fetch/$s_!XEdo!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!XEdo!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png" width="428" height="353.81333333333333" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:372,&quot;width&quot;:450,&quot;resizeWidth&quot;:428,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!XEdo!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png 424w, https://substackcdn.com/image/fetch/$s_!XEdo!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png 848w, https://substackcdn.com/image/fetch/$s_!XEdo!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png 1272w, https://substackcdn.com/image/fetch/$s_!XEdo!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0e7da6b9-8b2d-4cb3-9822-87a0af8c7bca_450x372.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Another way to think about this is to imagine you are adding the values of 2x 4-element arrays together to produce a new 4-element array.</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!yTlL!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!yTlL!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png 424w, https://substackcdn.com/image/fetch/$s_!yTlL!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png 848w, 
https://substackcdn.com/image/fetch/$s_!yTlL!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png 1272w, https://substackcdn.com/image/fetch/$s_!yTlL!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!yTlL!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png" width="245" height="156" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/e64c98b1-3498-4260-b8c4-1060988535f5_245x156.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:156,&quot;width&quot;:245,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!yTlL!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png 424w, https://substackcdn.com/image/fetch/$s_!yTlL!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png 848w, 
https://substackcdn.com/image/fetch/$s_!yTlL!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png 1272w, https://substackcdn.com/image/fetch/$s_!yTlL!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe64c98b1-3498-4260-b8c4-1060988535f5_245x156.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><p>Understanding this concept is the most important part of SIMD programming. Working &#8220;across/horizontally&#8221; between lanes is not really something you do in SIMD. There are some cases where working horizontally is the correct operation, but in most cases I would argue you are probably doing something wrong if you need to go horizontal.  This will make more sense later in the article when we look at benchmarks.</p><div><hr></div><h2>CPU Support for SIMD</h2><p>How wide the registers can be and what kind of SIMD instructions are available depends on what the CPU supports. And here is where it gets a little bit complicated.</p><p>SIMD support comes in the form of &#8220;Instruction Set Architecture&#8221; (ISA) extensions. The first SIMD ISA that was added to x86 was Intel&#8217;s Streaming SIMD Extensions (SSE) in 1999. SSE supports 128-bit registers and has been upgraded over the years all the way up to SSE4. The &#8220;Advanced Vector Extensions&#8221; (AVX) ISA introduced 256-bit registers, and the AVX-512 ISA supports 512-bit registers.</p><p>The support varies greatly depending on the hardware; you can query ISA support using the <code>cpuid</code><a href="https://en.wikipedia.org/wiki/CPUID"> feature bits</a><a class="footnote-anchor" data-component-name="FootnoteAnchorToDOM" id="footnote-anchor-2" href="#footnote-2" target="_self">2</a>. 
Here&#8217;s an example on Windows using the <code>__cpuid</code> and <code>__cpuidex</code><a href="https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex"> intrinsics</a><a class="footnote-anchor" data-component-name="FootnoteAnchorToDOM" id="footnote-anchor-3" href="#footnote-3" target="_self">3</a>:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!dWbv!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!dWbv!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png 424w, https://substackcdn.com/image/fetch/$s_!dWbv!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png 848w, https://substackcdn.com/image/fetch/$s_!dWbv!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png 1272w, https://substackcdn.com/image/fetch/$s_!dWbv!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!dWbv!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png" width="610" height="677.1485411140584" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/20a653ed-e509-4f72-a606-0ced7764a921_754x837.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:837,&quot;width&quot;:754,&quot;resizeWidth&quot;:610,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!dWbv!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png 424w, https://substackcdn.com/image/fetch/$s_!dWbv!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png 848w, https://substackcdn.com/image/fetch/$s_!dWbv!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png 1272w, https://substackcdn.com/image/fetch/$s_!dWbv!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F20a653ed-e509-4f72-a606-0ced7764a921_754x837.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>If you attempt to execute SIMD instructions on a machine that doesn&#8217;t have support for that instruction then your program will fault and crash. Thus, it&#8217;s important that you ensure the target hardware has support for whatever SIMD ISA you put in your program. 
</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!6sFq!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!6sFq!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png 424w, https://substackcdn.com/image/fetch/$s_!6sFq!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png 848w, https://substackcdn.com/image/fetch/$s_!6sFq!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png 1272w, https://substackcdn.com/image/fetch/$s_!6sFq!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!6sFq!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png" width="346" height="425.9095238095238" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/fdef61ee-9c99-4453-82ed-12597354e90a_420x517.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:false,&quot;imageSize&quot;:&quot;normal&quot;,&quot;height&quot;:517,&quot;width&quot;:420,&quot;resizeWidth&quot;:346,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!6sFq!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png 424w, https://substackcdn.com/image/fetch/$s_!6sFq!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png 848w, https://substackcdn.com/image/fetch/$s_!6sFq!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png 1272w, https://substackcdn.com/image/fetch/$s_!6sFq!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffdef61ee-9c99-4453-82ed-12597354e90a_420x517.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Table of ISAs and when they were first introduced</figcaption></figure></div><p>I would recommend picking a min-spec ISA that users require to run the program. As of April 2024, <a href="https://store.steampowered.com/hwsurvey/Steam-Hardware-Software-Survey">&gt;99% of users on Steam have SSE4.2 support</a><a class="footnote-anchor" data-component-name="FootnoteAnchorToDOM" id="footnote-anchor-4" href="#footnote-4" target="_self">4</a> which makes it a very reasonable min-spec for games. Nearly 94% of users support the AVX2 ISA which makes it very tempting for us since the register size is double that of SSE &#8212; meaning up to double the performance. But leaving out 6% of potential customers is not insignificant, so I would not recommend using it as min-spec. A better approach would be to have SSE4.2 as the min-spec but only enable AVX2 for users whose hardware supports it. This sounds simple but can be hard in practice. 
We will look more into managing this later.</p><p></p><div><hr></div><h2>Getting started with SIMD</h2><p>There are a few different ways that you can program with SIMD. I&#8217;ll cover three different approaches and then finally go over pros&#8212;cons of each one.</p><h4>Explicit SIMD with intrinsics </h4><p>First, you can program SIMD explicitly by using whatever intrinsic functions that are exposed in your language. On Windows (MSVC) C/C++ the available functions and corresponding header files can be found on the <a href="https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html">Intel Intrinsics Guide</a><a class="footnote-anchor" data-component-name="FootnoteAnchorToDOM" id="footnote-anchor-5" href="#footnote-5" target="_self">5</a>.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!bXL_!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!bXL_!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png 424w, https://substackcdn.com/image/fetch/$s_!bXL_!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png 848w, https://substackcdn.com/image/fetch/$s_!bXL_!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png 1272w, 
https://substackcdn.com/image/fetch/$s_!bXL_!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!bXL_!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png" width="552" height="606.6174142480211" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/acacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:833,&quot;width&quot;:758,&quot;resizeWidth&quot;:552,&quot;bytes&quot;:51228,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!bXL_!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png 424w, https://substackcdn.com/image/fetch/$s_!bXL_!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png 848w, https://substackcdn.com/image/fetch/$s_!bXL_!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png 1272w, 
https://substackcdn.com/image/fetch/$s_!bXL_!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Facacb839-e80f-4c7f-9d67-75eca7b975b9_758x833.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Image courtesy of Intel&#174;</figcaption></figure></div><p>This is an awesome resource and is always on my second monitor when I program SIMD. It contains all available intrinsics for each ISA and detailed descriptions of them. 
Let&#8217;s take a closer look at the highlighted areas:</p><ol><li><p>This filters the available intrinsics based on your target ISA. You can either pick &#8220;families&#8221; of ISAs, or specific ones if you need.</p></li><li><p>Here we can see which header file the intrinsic is defined in, what x86/x64 instruction the compiler will generate and finally the <code>cpuid</code> flag required.</p></li><li><p>Performance metrics for Intel CPUs. This is very valuable to get a rough estimate of what kind of performance you can expect. You might already know what Latency &amp; Throughput are, but just as a quick primer: Latency is how long it takes for the instruction to complete from the time it was issued. Latency of 4 clock cycles means it will take 4 cycles for it to complete. Throughput is how often we can issue the instruction. This is often measured in reciprocal throughput (although people usually don&#8217;t clarify if they are talking about reciprocal or not, which can be confusing). A reciprocal throughput of 1 means we can issue the same instruction 1 cycle after the previous one, as long as the second doesn&#8217;t depend on the result of the first one. Reciprocal throughput of 0.33 means we can issue 3 of the same instruction on the same clock cycle. 
If we want to issue two float multiply instructions that have latency of 4 and throughput of 1, the total time for this to complete is 5 cycles provided they don&#8217;t depend on each other:</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!1f2n!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!1f2n!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png 424w, https://substackcdn.com/image/fetch/$s_!1f2n!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png 848w, https://substackcdn.com/image/fetch/$s_!1f2n!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png 1272w, https://substackcdn.com/image/fetch/$s_!1f2n!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!1f2n!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png" width="541" height="201" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:201,&quot;width&quot;:541,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!1f2n!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png 424w, https://substackcdn.com/image/fetch/$s_!1f2n!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png 848w, https://substackcdn.com/image/fetch/$s_!1f2n!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png 1272w, https://substackcdn.com/image/fetch/$s_!1f2n!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5ebb9f34-10f2-4d32-8b0a-a82c8a139337_541x201.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a><figcaption class="image-caption">Pipelining of two non dependent float MULs</figcaption></figure></div></li></ol><p>You&#8217;ll notice the <code>mm</code> prefix for the intrinsics. This stands for &#8220;multimedia&#8221; which is just a legacy term that we are still stuck with. 
But I digress, what you might notice is how verbose these intrinsics are, it&#8217;s almost like you are programming in Assembly. This is by design since the intrinsics are supposed to map 1:1 with whatever x86/x64 instruction it&#8217;s meant to generate. Sometimes, the Intel guide will say that an intrinsic is a &#8220;Sequence&#8221;, which means that it will generate multiple instructions. </p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!UUdG!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!UUdG!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png 424w, https://substackcdn.com/image/fetch/$s_!UUdG!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png 848w, https://substackcdn.com/image/fetch/$s_!UUdG!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png 1272w, https://substackcdn.com/image/fetch/$s_!UUdG!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!UUdG!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png" width="364" 
height="247.74869109947645" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:260,&quot;width&quot;:382,&quot;resizeWidth&quot;:364,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!UUdG!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png 424w, https://substackcdn.com/image/fetch/$s_!UUdG!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png 848w, https://substackcdn.com/image/fetch/$s_!UUdG!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png 1272w, https://substackcdn.com/image/fetch/$s_!UUdG!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F72b614bd-2cb3-4c03-b0a5-fe9f9bcd2670_382x260.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Image courtesy of Intel&#174;</figcaption></figure></div><p>It&#8217;s quite common to write a SIMD wrapper library that can make programming with SIMD a bit more enjoyable. But you need to be careful to not over-abstract your wrappers. When working with performance-oriented code you want to keep abstractions to a minimum and eliminate uncertainties since all they do is get in your way and make it very hard to reason about your code.</p><h4>Compiler Auto-Vectorization</h4><p>The second option for SIMD is to instead rely on the compiler to generate the SIMD instructions for us by &#8220;auto-vectorizing&#8221;. In reality, though, this should really be used in conjunction with explicit SIMD, as this is not a good option for reliably generating optimal SIMD code. A compiler is not magic &#8212; it&#8217;s just a tool that has to follow strict rules for what it can and can&#8217;t do. 
You have to <strong>play the compiler</strong> to have it generate the code you want. As a result, it&#8217;s very easy to one day have working auto-vectorized code, but then break it by just making a simple change to the code. </p><p>A very common restriction to compiler auto-vectorization is aliasing. Look at this very simple function:</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!zEDJ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!zEDJ!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png 424w, https://substackcdn.com/image/fetch/$s_!zEDJ!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png 848w, https://substackcdn.com/image/fetch/$s_!zEDJ!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png 1272w, https://substackcdn.com/image/fetch/$s_!zEDJ!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!zEDJ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png" width="531" height="191.52307692307693" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:211,&quot;width&quot;:585,&quot;resizeWidth&quot;:531,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!zEDJ!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png 424w, https://substackcdn.com/image/fetch/$s_!zEDJ!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png 848w, https://substackcdn.com/image/fetch/$s_!zEDJ!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png 1272w, https://substackcdn.com/image/fetch/$s_!zEDJ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2a2198c5-08f8-4f04-983b-14efef05af17_585x211.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><p>You would expect the compiler to easily be able to generate SIMD code. But it turns out it can&#8217;t in a lot of cases. Because the pointer <code>dst</code>, <code>a</code> and <code>b</code> could all be the same, which means this code could break. 
A simple fix would be to add the <code>__restrict</code> keyword next to the pointers, which tells the compiler that the pointers are guaranteed to point to different places (but that doesn&#8217;t stop the programmer from still passing in the same pointer&#8230;). That said, the latest MSVC compilers will still auto-vectorize this, but they have to insert additional checks to make sure the pointers are not aliasing.</p><p>There are plenty of restrictions to what can and can&#8217;t be vectorized, but at the end of the day the compiler must put a correct result ahead of optimizations (unless you have undefined behavior). The key takeaway here is that you should always check the assembly output of your routine to make sure the compiler did the right thing, and I would personally <strong>never</strong> rely on auto-vectorization for parts of the program that are very important.</p><p>Auto-vectorization is usually enabled by default when compiling optimized builds. MSVC will try to vectorize your code when compiling with level <code>/O2</code>. <em>I think this is also the case when compiling with Clang, but don&#8217;t quote me on this.</em></p><h4>Intel&#8217;s ISPC compiler</h4><p>The third option is to use <a href="https://ispc.github.io/">Intel ISPC</a><a class="footnote-anchor" data-component-name="FootnoteAnchorToDOM" id="footnote-anchor-6" href="#footnote-6" target="_self">6</a>. This is a custom compiler that accepts shader-like code (similar to that of GLSL or HLSL) and automatically compiles it to vectorized assembly code. You can then link these functions inside your C/C++ code like a normal function. </p><p>What I like about this is that you can write code in a serial fashion like you normally would. It also automatically picks the most optimal version to run based on what ISA is supported. 
When writing explicit SIMD with intrinsics we would have to maintain separate code paths depending on whether we want to run the SSE, AVX or AVX-512 version, which adds complexity and maintenance costs.</p><p>I have not used ISPC too much so I can&#8217;t really comment more about it, but I like the idea of ISPC.</p><p>If you use Unreal Engine, ISPC is already integrated there and is used extensively in the engine.</p><h4>Summary of Pros&#8212;Cons to each approach</h4><p>Let's start with explicit SIMD. What I like about this is that it gives you full control over the generated code since the intrinsics often map directly to a single instruction. That way it makes it possible to make back-of-the-envelope calculations as to what kind of performance to expect. The control also allows you to make further optimizations by utilizing tricks, especially around floating point math that a compiler might not be allowed to do. Since you understand your program and the problem you are trying to solve better, you can write the best possible code yourself. However, when working on large teams where the programmers have varying experience levels, SIMD code can be unintuitive, so only a select few might be able to maintain that code. Another disadvantage is when you want to support multiple ISAs. Supporting multiple ISAs means you need to write multiple versions of the same routine but at different widths (128, 256, 512, etc.). Unfortunately, this means that some applications only stick to supporting the min-spec ISA such as SSE and leave tons of performance on the table for users with AVX support (looking at you here Unreal Engine&#8230;). </p><p>I think the pros and cons of auto-vectorization are quite obvious. Not having to think about SIMD and just have the compiler magically take care of things is nice, and might lead to speed improvements in areas where you didn&#8217;t expect it. 
But as I&#8217;ve already mentioned you cannot rely on auto-vectorization for performance critical areas. You have to constantly check the assembly output to ensure the compiler did the right thing, because the vectorization can silently break otherwise. Definitely let the compiler auto-vectorize code, but it needs to be combined with either explicit SIMD or ISPC.</p><p>When it comes to ISPC, as I&#8217;ve mentioned I don&#8217;t have tons of experience with it. But I can see the problems it&#8217;s trying to solve and I like what I see. The ability to just write code once and have it select the most optimal version based on the machine&#8217;s ISA support sounds very attractive. It would address the two top complaints I have about explicit SIMD: maintainability and support for multiple ISAs. Again, I haven&#8217;t used ISPC much but the only downsides I can see are potential compiler bugs and that it&#8217;s another layer of abstraction. More abstractions mean less control, which in turn may lead to worse performance. </p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://dennisrants.substack.com/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Dennis's Rants is supported by my readers. Please consider becoming a free or paid subscriber</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><div><hr></div><h2>Using SIMD</h2><p>Alright, now that we know how to use SIMD let&#8217;s start coding. 
I&#8217;ll be writing SIMD using intrinsics since that&#8217;s what I&#8217;m most familiar with, and I recommend that you do the same when starting out. If you later choose to use an abstraction layer then the extra experience you&#8217;ve gained using intrinsics won&#8217;t hurt you.</p><h4>The Na&#239;ve Vector Dot Product</h4><p>A very common first problem to convert to SIMD is a 32-bit float 3-vector dot product: <code>Ax*Bx+Ay*By+Az*Bz</code>. It&#8217;s used everywhere (especially in games) and the operations are straightforward. We have 3x MULs and 2x ADDs. So we start with the multiplication and immediately run into a problem: the minimum register width is 128 bits, but we only have 96 bits of data. One solution is to extend the vector struct by adding a 4th <code>W</code> component, but this is a really large change which may have adverse effects on your codebase. Alternatively you can have a local variable that has the W component and copy the X, Y, Z components into it. The other option is to just include whatever data is at the end of the vector and treat the result in that lane as garbage. 
This is what I&#8217;m opting for here <em>(spoiler: this can be a bad idea)</em>:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!UAPM!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!UAPM!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png 424w, https://substackcdn.com/image/fetch/$s_!UAPM!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png 848w, https://substackcdn.com/image/fetch/$s_!UAPM!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png 1272w, https://substackcdn.com/image/fetch/$s_!UAPM!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!UAPM!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png" width="593" height="497.4768740031898" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:526,&quot;width&quot;:627,&quot;resizeWidth&quot;:593,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!UAPM!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png 424w, https://substackcdn.com/image/fetch/$s_!UAPM!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png 848w, https://substackcdn.com/image/fetch/$s_!UAPM!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png 1272w, https://substackcdn.com/image/fetch/$s_!UAPM!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F225268db-3cbc-474a-bd7e-ba3d25ebc559_627x526.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Alright, now we&#8217;ve got the 3x MULs sorted, let&#8217;s do the additions. And this is where you run into the biggest problem: we have to operate horizontally for the adds. If you remember from the intro section, we ideally want to stay vertical with SIMD since that&#8217;s what it&#8217;s designed to do &#8212; there is not much support for working horizontally. Nevertheless, how do we solve this?</p><p>Let&#8217;s look at some solutions other people have come up with to this problem. Here is one answer from <a href="https://stackoverflow.com/questions/4120681/how-to-calculate-single-vector-dot-product-using-sse-intrinsic-functions-in-c">Stack Overflow</a> suggesting the use of shuffles/swizzles. If we dig into the <a href="https://github.com/EpicGames/UnrealEngine/blob/5.3.2-release/Engine/Source/Runtime/Core/Public/Math/UnrealMathSSE.h#L1576">Unreal Engine source code</a>, we also find a <code>VectorDot4</code> function that uses swizzles, so this seems like the right approach, right? 
The idea is to have 3 copies of the multiplied result, then use swizzles to align the X, Y and Z components to the same lane: </p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!osGk!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!osGk!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png 424w, https://substackcdn.com/image/fetch/$s_!osGk!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png 848w, https://substackcdn.com/image/fetch/$s_!osGk!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png 1272w, https://substackcdn.com/image/fetch/$s_!osGk!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!osGk!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png" width="605" height="208.6206896551724" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/e3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:250,&quot;width&quot;:725,&quot;resizeWidth&quot;:605,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!osGk!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png 424w, https://substackcdn.com/image/fetch/$s_!osGk!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png 848w, https://substackcdn.com/image/fetch/$s_!osGk!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png 1272w, https://substackcdn.com/image/fetch/$s_!osGk!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe3e7afd9-286a-4575-9a89-62cc84b27645_725x250.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><p>Then we can go vertical again with the adds, and finally we will have the dot product result in that lane.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" 
href="https://substackcdn.com/image/fetch/$s_!_7FC!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!_7FC!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png 424w, https://substackcdn.com/image/fetch/$s_!_7FC!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png 848w, https://substackcdn.com/image/fetch/$s_!_7FC!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png 1272w, https://substackcdn.com/image/fetch/$s_!_7FC!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!_7FC!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png" width="666" height="324.46153846153845" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:361,&quot;width&quot;:741,&quot;resizeWidth&quot;:666,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!_7FC!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png 424w, https://substackcdn.com/image/fetch/$s_!_7FC!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png 848w, https://substackcdn.com/image/fetch/$s_!_7FC!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png 1272w, https://substackcdn.com/image/fetch/$s_!_7FC!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0289a61d-fed5-4699-9a55-a04c2c2b3dee_741x361.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Alright great! I&#8217;m sure you&#8217;re excited to see some performance numbers. So let&#8217;s write a quick benchmark that runs our routine on 1024 pairs of random vectors. <em>The vectors are laid out contiguously and memory access is linear in a tight loop. 
The cache has also been warmed up.</em></p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Qiz7!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Qiz7!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png 424w, https://substackcdn.com/image/fetch/$s_!Qiz7!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png 848w, https://substackcdn.com/image/fetch/$s_!Qiz7!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png 1272w, https://substackcdn.com/image/fetch/$s_!Qiz7!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Qiz7!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png" width="407" height="168.36807817589576" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/cf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:127,&quot;width&quot;:307,&quot;resizeWidth&quot;:407,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Qiz7!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png 424w, https://substackcdn.com/image/fetch/$s_!Qiz7!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png 848w, https://substackcdn.com/image/fetch/$s_!Qiz7!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png 1272w, https://substackcdn.com/image/fetch/$s_!Qiz7!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcf21f5ad-aa9a-4a00-bae1-35ea3f7ef049_307x127.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><p>Alright, we get a little bit of speed gains compared to running a dot product function that isn&#8217;t using SIMD. But to be honest, this result is a bit disappointing. 
I&#8217;ve claimed multiple times in this post that we can expect 4x, 8x or even 16x speed improvements when using SIMD, so what&#8217;s wrong?</p><p>We are supposed to be processing multiple things at once &#8212; it&#8217;s in the name &#8220;Single Instruction Multiple Data&#8221;. But what we are essentially doing here is processing a single thing at once. Our mental model is <em>&#8220;optimize this one single DotProduct&#8221;</em>, when we should be thinking <em>&#8220;optimize multiple DotProducts&#8221;</em>. We want to optimize for throughput &#8212; not latency. When using SIMD we have to change the way we think about our problems. </p><p>If we only have a single dot product to calculate then SIMD is probably not the right tool for this problem. But you almost certainly have multiple things that need to be processed, which is especially true in games. In our case we have 1024 dot products to calculate &#8212; so let&#8217;s process them in batches of 4 instead.</p><h4>Vector Dot Product the Right Way</h4><p>Now that we have rephrased our problem, let&#8217;s implement the correct solution. Let&#8217;s start with the input data for the dot products. Our data layout for the dot product has been a regular 3-wide vector. This is known as Array-of-Structure (AoS) data layout. 
But in order to process 4 different dot products, what we need is a Structure-of-Array (SoA) layout.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!8obx!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!8obx!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png 424w, https://substackcdn.com/image/fetch/$s_!8obx!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png 848w, https://substackcdn.com/image/fetch/$s_!8obx!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png 1272w, https://substackcdn.com/image/fetch/$s_!8obx!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!8obx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png" width="622" height="270.7529411764706" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:296,&quot;width&quot;:680,&quot;resizeWidth&quot;:622,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!8obx!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png 424w, https://substackcdn.com/image/fetch/$s_!8obx!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png 848w, https://substackcdn.com/image/fetch/$s_!8obx!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png 1272w, https://substackcdn.com/image/fetch/$s_!8obx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1cf288e3-d7ba-4c82-a6e1-8311e263f1df_680x296.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Array-of-Structure vs Structure-of-Array data layout</figcaption></figure></div><p>As you can see with the SoA layout the data is naturally organized into the correct lanes that we want. 
The X, Y and Z components are already vertically aligned so there is no need for us to swizzle the data &#8212; as a result the code becomes very straight forward:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!L18m!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!L18m!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png 424w, https://substackcdn.com/image/fetch/$s_!L18m!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png 848w, https://substackcdn.com/image/fetch/$s_!L18m!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png 1272w, https://substackcdn.com/image/fetch/$s_!L18m!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!L18m!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png" width="640" height="391.7525773195876" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:475,&quot;width&quot;:776,&quot;resizeWidth&quot;:640,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!L18m!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png 424w, https://substackcdn.com/image/fetch/$s_!L18m!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png 848w, https://substackcdn.com/image/fetch/$s_!L18m!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png 1272w, https://substackcdn.com/image/fetch/$s_!L18m!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F872f1c40-3948-4490-bc69-72c7fd308b47_776x475.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>When working with SoA data the routine naturally folds down into SIMD. There is no trickery here: we have 3x MULs and 2x ADDs, which is exactly how you would write a scalar dot product function without SIMD. Anyone can understand what this code does and how it works. 
</p><p>Now the moment of truth, here are the performance numbers:</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!DRSV!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!DRSV!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png 424w, https://substackcdn.com/image/fetch/$s_!DRSV!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png 848w, https://substackcdn.com/image/fetch/$s_!DRSV!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png 1272w, https://substackcdn.com/image/fetch/$s_!DRSV!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!DRSV!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png" width="392" height="209.42465753424656" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/8709ac23-a00b-4395-a372-ba54eece7849_292x156.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:156,&quot;width&quot;:292,&quot;resizeWidth&quot;:392,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!DRSV!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png 424w, https://substackcdn.com/image/fetch/$s_!DRSV!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png 848w, https://substackcdn.com/image/fetch/$s_!DRSV!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png 1272w, https://substackcdn.com/image/fetch/$s_!DRSV!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8709ac23-a00b-4395-a372-ba54eece7849_292x156.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><p>Close to a 4x speedup using this approach. These are the kinds of results we should expect from using SIMD. Because remember, the latency &amp; throughput of a scalar and the equivalent vector instruction is usually the same on modern hardware. 
Thus if we are not getting 4x, 8x or 16x speedups then this is typically a sign that something is wrong or we are not optimizing for maximum throughput.</p><p>I&#8217;ll also show a DotProduct8 routine that processes 8 individual dot products at once using AVX2, which is 256-bit wide.</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Izh3!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Izh3!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png 424w, https://substackcdn.com/image/fetch/$s_!Izh3!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png 848w, https://substackcdn.com/image/fetch/$s_!Izh3!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png 1272w, https://substackcdn.com/image/fetch/$s_!Izh3!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Izh3!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png" width="371" height="246.8932384341637" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/e7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:187,&quot;width&quot;:281,&quot;resizeWidth&quot;:371,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Izh3!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png 424w, https://substackcdn.com/image/fetch/$s_!Izh3!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png 848w, https://substackcdn.com/image/fetch/$s_!Izh3!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png 1272w, https://substackcdn.com/image/fetch/$s_!Izh3!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fe7951531-bdb1-42b6-8f01-a487aa5686a5_281x187.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><p>I expected an 8x speedup but it turns out we exceeded that and are able to run at 11x! 
The key takeaway here is that this approach is scalable, whereas the previous approach isn&#8217;t.</p><div><hr></div><h2>Tips for SIMD</h2><p>I&#8217;d like to share some tips and tricks that I like to use when working with SIMD.</p><h4>Handling Branches</h4><p>Control flow is usually a bit tricky with SIMD. Since we are working with multiple elements at once that means we can have <code>N</code> possible branches that need to be taken. If all elements have the same condition then there is no problem since they all take the same path and the program can continue as normal. </p><p>Using <code>if</code> statements in SIMD code is usually a code smell. An unpredictable/random branch will incur a 10-20 cycle stall (depends on your CPU) which can easily be more cycles than your entire SIMD routine. This is why it&#8217;s better to not have branches in the first place.</p><p>Sometimes you may go faster by doing more work, in our case that may be evaluating both sides of a branch then selecting the correct one, which is similar to what GPUs do. 
I like this approach because the performance is very predictable:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!nYTQ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!nYTQ!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png 424w, https://substackcdn.com/image/fetch/$s_!nYTQ!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png 848w, https://substackcdn.com/image/fetch/$s_!nYTQ!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png 1272w, https://substackcdn.com/image/fetch/$s_!nYTQ!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!nYTQ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png" width="618" height="298.28323699421964" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/a24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:334,&quot;width&quot;:692,&quot;resizeWidth&quot;:618,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!nYTQ!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png 424w, https://substackcdn.com/image/fetch/$s_!nYTQ!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png 848w, https://substackcdn.com/image/fetch/$s_!nYTQ!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png 1272w, https://substackcdn.com/image/fetch/$s_!nYTQ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa24e5318-15ac-4867-819d-6d3dc95d9d3f_692x334.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>This might not always be a good approach &#8212; if the two sides require heavy computation then the select style can become too expensive. 
In cases like this I like to first sort the input data in to buckets, then run the <code>if/else</code> routines for the corresponding bucket.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!zcpD!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!zcpD!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png 424w, https://substackcdn.com/image/fetch/$s_!zcpD!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png 848w, https://substackcdn.com/image/fetch/$s_!zcpD!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png 1272w, https://substackcdn.com/image/fetch/$s_!zcpD!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!zcpD!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png" width="624" height="379.7916137229987" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/a4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:479,&quot;width&quot;:787,&quot;resizeWidth&quot;:624,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!zcpD!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png 424w, https://substackcdn.com/image/fetch/$s_!zcpD!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png 848w, https://substackcdn.com/image/fetch/$s_!zcpD!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png 1272w, https://substackcdn.com/image/fetch/$s_!zcpD!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa4c4fdb6-2ab0-4840-bd96-8c2202412aff_787x479.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>In this case I have some input data which depending on a condition I want to either run <code>ExpensiveFunc1</code> or <code>ExpensiveFunc2</code>. First I run a Sorting function that either copies the data from <code>inout</code> to <code>bucket1</code> or <code>bucket2</code> depending on the condition we are testing. We also maintain a list of indices which tells if we put the data in the first or second bucket. Then we are ready to run our main routines, one for each bucket. Once we are done we have to copy the data from the buckets back into the <code>inout</code> result. It basically just iterates over the indices we kept and copies either <code>bucket1</code> or <code>bucket2</code> into <code>inout</code>.</p><p>This means we have to maintain copies of the same data, but that&#8217;s okay. If we want speed we are happy to trade a bit of storage space. Another thing to note is that we could fold the <code>WriteBack</code> function into the main expensive function routines. 
Since the <code>WriteBack</code> is going to be memory move bound, we might be able to get that for free if <code>ExpensiveFunc1/2</code> are heavy on compute. This highly depends on what exactly those functions are doing, so you need to experiment with the results. This is probably another topic that deserves a dedicated post&#8230;.</p><p>I would also look into &#8220;Parallel Prefix Sums&#8221; which can be very handy when doing this sort of stuff.</p><h4>Uneven Input Data</h4><p>It&#8217;s fairly common to have input data that are not at the correct stride for our SIMD routines, say we only have 15 elements but our routine requires 16 elements. We already looked at this in the DotProduct example, but one way to deal with this is to just ignore the last element by still performing the operation, but then discard the final value. </p><p>Though when doing this we have to make sure that the memory at the end is valid. It is possible that the data spans across multiple virtual page boundaries, and if one of those pages is not mapped to physical memory or if we don&#8217;t have permissions to write to that page then our program will fault. </p><p>Alternatively, we can make sure that our input data is always padded so that it&#8217;s evenly aligned to whatever stride we are working with. 
If we are allocating arrays for some entities then I would round up the entity count and pad the end:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!MoWG!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!MoWG!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png 424w, https://substackcdn.com/image/fetch/$s_!MoWG!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png 848w, https://substackcdn.com/image/fetch/$s_!MoWG!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png 1272w, https://substackcdn.com/image/fetch/$s_!MoWG!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!MoWG!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png" width="642" height="751.3470967741936" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/ccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:907,&quot;width&quot;:775,&quot;resizeWidth&quot;:642,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!MoWG!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png 424w, https://substackcdn.com/image/fetch/$s_!MoWG!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png 848w, https://substackcdn.com/image/fetch/$s_!MoWG!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png 1272w, https://substackcdn.com/image/fetch/$s_!MoWG!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fccf0a21f-6088-483f-82ab-6eec0a6f82d8_775x907.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Now the  <code>Move</code> routine doesn&#8217;t need to handle any special cases for when <code>count</code> is less than 4. The data at the end will just contain some garbage, but that&#8217;s okay since we will never use that data. At worst we are just wasting a couple of bytes of memory.</p><h4>Use maximum stride when targeting different widths</h4><p>We need to properly handle targeting multiple ISAs, a common one is having 128-bit SSE as min-spec but also optionally supporting 256-bit AVX. In this case I would ensure that the input data is always aligned to a 256-bit stride since that is the larger of the two. 
We can then run a single AVX routine, but we will also be able to run SSE on that &#8212; we just have to run it twice instead:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!D_zI!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!D_zI!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png 424w, https://substackcdn.com/image/fetch/$s_!D_zI!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png 848w, https://substackcdn.com/image/fetch/$s_!D_zI!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png 1272w, https://substackcdn.com/image/fetch/$s_!D_zI!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!D_zI!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png" width="662" height="691.6758620689656" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/c56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:909,&quot;width&quot;:870,&quot;resizeWidth&quot;:662,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!D_zI!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png 424w, https://substackcdn.com/image/fetch/$s_!D_zI!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png 848w, https://substackcdn.com/image/fetch/$s_!D_zI!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png 1272w, https://substackcdn.com/image/fetch/$s_!D_zI!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc56509a3-28bc-44fe-a0a7-b5b8f48dedd3_870x909.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>So far I&#8217;ve found this to be the only reasonable way of handling multiple widths without ISPC. Obviously if we were to do the opposite and align to 128-bit stride while targeting AVX, then this wouldn&#8217;t work. This same principle can be applied to AVX-512, but then we&#8217;d instead have to align to 512-bit strides instead of 256-bit. I think it could make sense to implement some kind of abstraction here.</p><h4>Quick note on Alignment</h4><p>When moving data to and from SIMD registers, you have an option of using aligned or unaligned loads/stores, such as:</p><pre><code>_mm_load_ps(...) // load 16-bytes at an aligned address
_mm_loadu_ps(...) // load 16-bytes at an unaligned address</code></pre><p>If you are going to use the aligned version, you obviously need to make sure that the address is aligned to a byte boundary equal to the register size (16-bytes for SSE, 32-bytes for AVX, etc.) If you don&#8217;t do this then your program will fault and crash.</p><p>In terms of performance, the aligned version is theoretically faster. Unaligned data may span multiple cache-lines &#8212; which means we have to load more cache-lines, and the load unit has to then merge from multiple load buffers.</p><p>However, in practice alignment is usually not a big problem. Because modern CPUs are heavily pipelined there is probably some other work the CPU can do while it deals with the unaligned penalty. It is a problem when you need maximum bandwidth and you aren&#8217;t doing much computation.</p><p>For this reason I typically use the unaligned load versions of the intrinsics and I rarely notice the performance penalty.</p><div><hr></div><h2>Writing &#8216;Optimizable&#8217; code</h2><p>In my opinion writing the actual SIMD code is not the difficult part of SIMD programming, where people often get stuck is typically at the data layout stage. If the data in our program is not structured correctly then there is not much we can do. </p><p>This is why it&#8217;s very important to keep SIMD and optimization in mind early on when designing a system. I don&#8217;t recommend to immediately start writing SIMD or optimizing code when first starting to implement a system, since that&#8217;s probably a waste of time. But you need to look ahead and see how you might be able to optimize the system in the future. If you don&#8217;t do this then you run a very real risk of not being able to SIMDize your code at all, since by the time you eventually start thinking about SIMD the system might be too complex, which might require a large refactor or even a rewrite of the entire system. 
As a result, the optimization might never happen. </p><p>Thinking about your data layout might also open the opportunity for other optimizations, such as multithreading and ensuring that caching behavior is ideal. To be honest with you, writing SIMD is usually the last step of my optimization process. If you ensure that your algorithm has the correct time-complexity and that the data is laid out in a way that&#8217;s ideal for the CPU cache, then most code will run fast enough for you to not care. If I need it to run faster then the next step is to optionally move to Multithreading or SIMD, but again, the prerequisite is that we have architected the code in a way that allows us to do that.</p><p>Death by a thousand cuts is a very real thing, and the idea that you can optimize a program by <a href="https://ricomariani.medium.com/hotspots-premature-optimization-hoares-maxim-f87590a6c26a">fixing a few &#8220;hotspots&#8221; is a fallacy</a><a class="footnote-anchor" data-component-name="FootnoteAnchorToDOM" id="footnote-anchor-7" href="#footnote-7" target="_self">7</a>. I can&#8217;t stress enough how important it is to just keep performance in mind early on.</p><div><hr></div><h2>Final Notes</h2><p>Alright, I think I&#8217;ve about covered everything I want to. This is my first article so any feedback or comments would be appreciated. </p><p>I plan on doing more of these; next on my list is probably talking about CPU caches or some of my Unreal Engine experiments.</p><p>But let me know what you&#8217;d like to see next!</p><p>~ Dennis</p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://dennisrants.substack.com/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Dennis's Rants is supported by my readers. 
Please consider becoming a free or paid subscriber</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><div class="footnote" data-component-name="FootnoteToDOM"><a id="footnote-1" href="#footnote-anchor-1" class="footnote-number" contenteditable="false" target="_self">1</a><div class="footnote-content"><p><a href="https://www.flickr.com/people/130561288@N04/">AMD &#8220;Zen 1&#8221; DIE shot by Fritzchen Fritz</a></p></div></div><div class="footnote" data-component-name="FootnoteToDOM"><a id="footnote-2" href="#footnote-anchor-2" class="footnote-number" contenteditable="false" target="_self">2</a><div class="footnote-content"><p><a href="https://en.wikipedia.org/wiki/CPUID">cpuid instruction on Wikipedia</a></p></div></div><div class="footnote" data-component-name="FootnoteToDOM"><a id="footnote-3" href="#footnote-anchor-3" class="footnote-number" contenteditable="false" target="_self">3</a><div class="footnote-content"><p><a href="https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex">__cpuid and __cpuidex intrinsics for MSVC/Windows</a></p></div></div><div class="footnote" data-component-name="FootnoteToDOM"><a id="footnote-4" href="#footnote-anchor-4" class="footnote-number" contenteditable="false" target="_self">4</a><div class="footnote-content"><p><a href="https://store.steampowered.com/hwsurvey/Steam-Hardware-Software-Survey">Steam hardware survey with SIMD support</a></p></div></div><div class="footnote" data-component-name="FootnoteToDOM"><a id="footnote-5" href="#footnote-anchor-5" class="footnote-number" contenteditable="false" target="_self">5</a><div class="footnote-content"><p><a 
href="https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html">Intel&#174; Intrinsics Guide</a></p></div></div><div class="footnote" data-component-name="FootnoteToDOM"><a id="footnote-6" href="#footnote-anchor-6" class="footnote-number" contenteditable="false" target="_self">6</a><div class="footnote-content"><p><a href="https://ispc.github.io/">Intel&#174; ISPC compiler</a></p></div></div><div class="footnote" data-component-name="FootnoteToDOM"><a id="footnote-7" href="#footnote-anchor-7" class="footnote-number" contenteditable="false" target="_self">7</a><div class="footnote-content"><p><a href="https://ricomariani.medium.com/hotspots-premature-optimization-hoares-maxim-f87590a6c26a">Hotspots, Premature Optimization, Hoare&#8217;s Maxim</a></p></div></div>]]></content:encoded></item><item><title><![CDATA[Coming soon]]></title><description><![CDATA[This is Dennis&#39;s Rants.]]></description><link>https://dennisrants.substack.com/p/coming-soon</link><guid isPermaLink="false">https://dennisrants.substack.com/p/coming-soon</guid><dc:creator><![CDATA[Dennis Andersson]]></dc:creator><pubDate>Thu, 16 May 2024 11:55:23 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!nfdc!,w_256,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8007a87a-3e29-42a6-a5a0-9ca395bb3960_250x250.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<p>This is Dennis&#39;s Rants.</p><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://dennisrants.substack.com/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe now&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://dennisrants.substack.com/subscribe?"><span>Subscribe 
now</span></a></p>]]></content:encoded></item></channel></rss>