Articles - SQLServerCentral https://www.sqlservercentral.com The #1 SQL Server community Mon, 09 Mar 2026 23:39:27 +0000 en-GB hourly 1 https://wordpress.org/?v=6.8.1 SQL Art 2: St Patrick’s Day in SSMS (Shamrock + Pint + Pixel Text) https://www.sqlservercentral.com/articles/sql-art-2-st-patricks-day-in-ssms-shamrock-pint-pixel-text Mon, 16 Mar 2026 00:00:52 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4739098 Last time we turned SQL Server into a Christmas tree. Every year around St Patrick's Day I find myself doing something I can't fully justify. This year that thing was using SQL Server's spatial viewer to draw a shamrock and a pint of Guinness. Hope you have a happy St Patrick's Day and a few […]

The post SQL Art 2: St Patrick’s Day in SSMS (Shamrock + Pint + Pixel Text) appeared first on SQLServerCentral.

]]>
Last time we turned SQL Server into a Christmas tree.

Every year around St Patrick's Day I find myself doing something I can't fully justify. This year that thing was using SQL Server's spatial viewer to draw a shamrock and a pint of Guinness. Hope you have a happy St Patrick's Day and a few Guinness. Preferably not while writing geometry.

What you’ll get

  • A shamrock built from circles and a rectangle
  • A pint glass built from polygons
  • ‘HAPPY ST PATRICKS DAY’ in a chunky 5x7 pixel font
  • A small hack to bully SSMS into putting the layers in the right order

Run it

  1. Open SSMS
  2. Paste the script below into a new query window
  3. Execute
  4. In the results grid, click the shape cell and open the Spatial Results tab

If you do not see Spatial Results, enable it in SSMS options.

The Script

USE tempdb;
GO

-- Scratch table: one row per drawable shape. The colour label does not colour
-- anything by itself — it drives the layer/slot ordering in the final SELECT.
DROP TABLE IF EXISTS #stPatrick;
CREATE TABLE #stPatrick (shape geometry, colour varchar(20));

--------------------------------------------------------------------------------
-- 1) SHAMROCK
--------------------------------------------------------------------------------

-- Three buffered points (circles) form the leaves, a smaller circle fills the
-- centre, and a thin rectangle is the stem.
INSERT INTO #stPatrick(shape, colour) VALUES
(geometry::Point(5.0, 8.2, 0).STBuffer(1.2).MakeValid(), 'darkgreen'), -- top leaf
(geometry::Point(4.0, 7.2, 0).STBuffer(1.2).MakeValid(), 'darkgreen'), -- lower-left leaf
(geometry::Point(6.0, 7.2, 0).STBuffer(1.2).MakeValid(), 'darkgreen'), -- lower-right leaf
(geometry::Point(5.0, 7.5, 0).STBuffer(0.9).MakeValid(), 'green'),     -- centre highlight
(geometry::STGeomFromText('POLYGON((4.75 4.2, 5.25 4.2, 5.25 6.8, 4.75 6.8, 4.75 4.2))', 0).MakeValid(), 'darkgreen'), -- stem
-- Dummy shape off-screen to absorb slot 4 (purple) — keeps it away from real shapes
(geometry::Point(-3, 5, 0).STBuffer(0.01).MakeValid(), 'dummy');

--------------------------------------------------------------------------------
-- 2) PIXEL FONT
--------------------------------------------------------------------------------

-- @Scale is the side length of one "pixel" square in drawing units.
DECLARE @Scale decimal(6,2) = 0.18;
-- @Step is the horizontal advance per character: 5 pixel columns of glyph
-- plus 1 pixel column of gap = 6 * @Scale.
DECLARE @Step decimal(6,2) = (5 * @Scale) + @Scale;

-- The two caption lines to render.
DECLARE @Line1 nvarchar(60) = N'HAPPY ST';
DECLARE @Line2 nvarchar(60) = N'PATRICKS DAY';

-- Bottom-left origin of each caption line.
DECLARE @X1 decimal(6,2) = -1.0, @Y1 decimal(6,2) = 1.8;
DECLARE @X2 decimal(6,2) = -1.8, @Y2 decimal(6,2) = 0.4;

-- 5x7 bitmap font: one row per lit pixel of a glyph.
-- r = row 0..6 counted from the TOP of the glyph (the renderer flips it),
-- c = column 0..4 counted from the left.
DECLARE @Font TABLE (ch nchar(1), r tinyint, c tinyint);

-- NOTE: glyphs N and E are defined but unused by the two caption lines above;
-- they are handy if you remix the message.
INSERT @Font VALUES
-- H
(N'H',0,0),(N'H',0,4),(N'H',1,0),(N'H',1,4),(N'H',2,0),(N'H',2,4),
(N'H',3,0),(N'H',3,1),(N'H',3,2),(N'H',3,3),(N'H',3,4),
(N'H',4,0),(N'H',4,4),(N'H',5,0),(N'H',5,4),(N'H',6,0),(N'H',6,4),
-- A
(N'A',0,1),(N'A',0,2),(N'A',0,3),(N'A',1,0),(N'A',1,4),
(N'A',2,0),(N'A',2,4),(N'A',3,0),(N'A',3,1),(N'A',3,2),(N'A',3,3),(N'A',3,4),
(N'A',4,0),(N'A',4,4),(N'A',5,0),(N'A',5,4),(N'A',6,0),(N'A',6,4),
-- P
(N'P',0,0),(N'P',0,1),(N'P',0,2),(N'P',0,3),
(N'P',1,0),(N'P',1,4),(N'P',2,0),(N'P',2,4),
(N'P',3,0),(N'P',3,1),(N'P',3,2),(N'P',3,3),
(N'P',4,0),(N'P',5,0),(N'P',6,0),
-- Y
(N'Y',0,0),(N'Y',0,4),(N'Y',1,0),(N'Y',1,4),(N'Y',2,1),(N'Y',2,3),
(N'Y',3,2),(N'Y',4,2),(N'Y',5,2),(N'Y',6,2),
-- S
(N'S',0,1),(N'S',0,2),(N'S',0,3),(N'S',0,4),
(N'S',1,0),(N'S',2,0),
(N'S',3,1),(N'S',3,2),(N'S',3,3),
(N'S',4,4),(N'S',5,4),
(N'S',6,0),(N'S',6,1),(N'S',6,2),(N'S',6,3),
-- T
(N'T',0,0),(N'T',0,1),(N'T',0,2),(N'T',0,3),(N'T',0,4),
(N'T',1,2),(N'T',2,2),(N'T',3,2),(N'T',4,2),(N'T',5,2),(N'T',6,2),
-- R
(N'R',0,0),(N'R',0,1),(N'R',0,2),(N'R',0,3),
(N'R',1,0),(N'R',1,4),(N'R',2,0),(N'R',2,4),
(N'R',3,0),(N'R',3,1),(N'R',3,2),(N'R',3,3),
(N'R',4,0),(N'R',4,2),(N'R',5,0),(N'R',5,3),(N'R',6,0),(N'R',6,4),
-- I
(N'I',0,1),(N'I',0,2),(N'I',0,3),
(N'I',1,2),(N'I',2,2),(N'I',3,2),(N'I',4,2),(N'I',5,2),
(N'I',6,1),(N'I',6,2),(N'I',6,3),
-- C
(N'C',0,1),(N'C',0,2),(N'C',0,3),(N'C',0,4),
(N'C',1,0),(N'C',2,0),(N'C',3,0),(N'C',4,0),(N'C',5,0),
(N'C',6,1),(N'C',6,2),(N'C',6,3),(N'C',6,4),
-- K
(N'K',0,0),(N'K',0,4),(N'K',1,0),(N'K',1,3),
(N'K',2,0),(N'K',2,2),(N'K',3,0),(N'K',3,1),
(N'K',4,0),(N'K',4,2),(N'K',5,0),(N'K',5,3),(N'K',6,0),(N'K',6,4),
-- D
(N'D',0,0),(N'D',0,1),(N'D',0,2),(N'D',0,3),
(N'D',1,0),(N'D',1,4),(N'D',2,0),(N'D',2,4),
(N'D',3,0),(N'D',3,4),(N'D',4,0),(N'D',4,4),
(N'D',5,0),(N'D',5,4),(N'D',6,0),(N'D',6,1),(N'D',6,2),(N'D',6,3),
-- N
(N'N',0,0),(N'N',0,4),(N'N',1,0),(N'N',1,1),(N'N',1,4),
(N'N',2,0),(N'N',2,2),(N'N',2,4),(N'N',3,0),(N'N',3,3),(N'N',3,4),
(N'N',4,0),(N'N',4,4),(N'N',5,0),(N'N',5,4),(N'N',6,0),(N'N',6,4),
-- E
(N'E',0,0),(N'E',0,1),(N'E',0,2),(N'E',0,3),(N'E',0,4),
(N'E',1,0),(N'E',2,0),
(N'E',3,0),(N'E',3,1),(N'E',3,2),(N'E',3,3),
(N'E',4,0),(N'E',5,0),
(N'E',6,0),(N'E',6,1),(N'E',6,2),(N'E',6,3),(N'E',6,4);

-- Caption renderer: stamp one @Scale x @Scale square into #stPatrick for every
-- lit pixel of every glyph. The original script duplicated this 25-line loop
-- once per caption line; both lines use identical logic, so the loop is
-- written once and driven by a table of (text, origin) pairs.
DECLARE @i int, @len int, @ch nchar(1);

DECLARE @CaptionLines TABLE (
    lineNo tinyint PRIMARY KEY, -- render order (line 1 first, as before)
    txt    nvarchar(60),        -- caption text; spaces leave an empty slot
    x0     decimal(6,2),        -- left edge of the first glyph
    y0     decimal(6,2)         -- bottom edge of the glyph grid
);
INSERT @CaptionLines VALUES (1, @Line1, @X1, @Y1), (2, @Line2, @X2, @Y2);

DECLARE @lineNo tinyint = 1,
        @txt nvarchar(60), @x0 decimal(6,2), @y0 decimal(6,2);

WHILE @lineNo <= 2
BEGIN
    SELECT @txt = txt, @x0 = x0, @y0 = y0
    FROM @CaptionLines
    WHERE lineNo = @lineNo;

    SET @i = 1; SET @len = LEN(@txt);
    WHILE @i <= @len
    BEGIN
        SET @ch = SUBSTRING(@txt, @i, 1);
        IF @ch <> N' ' -- spaces advance the cursor but draw nothing
        BEGIN
            -- One axis-aligned square per (r, c) pixel of this glyph.
            -- X advances by @Step per character; (6 - r) flips the row index
            -- so font row 0 lands at the top of the glyph.
            INSERT INTO #stPatrick(shape, colour)
            SELECT geometry::STGeomFromText(
                CONCAT('POLYGON((',
                    CAST(ROUND(@x0+((@i-1)*@Step)+(c *@Scale),3) AS varchar(12)),' ',
                    CAST(ROUND(@y0+((6-r) *@Scale),3) AS varchar(12)),',',
                    CAST(ROUND(@x0+((@i-1)*@Step)+((c+1)*@Scale),3) AS varchar(12)),' ',
                    CAST(ROUND(@y0+((6-r) *@Scale),3) AS varchar(12)),',',
                    CAST(ROUND(@x0+((@i-1)*@Step)+((c+1)*@Scale),3) AS varchar(12)),' ',
                    CAST(ROUND(@y0+((6-r+1)*@Scale),3) AS varchar(12)),',',
                    CAST(ROUND(@x0+((@i-1)*@Step)+(c *@Scale),3) AS varchar(12)),' ',
                    CAST(ROUND(@y0+((6-r+1)*@Scale),3) AS varchar(12)),',',
                    CAST(ROUND(@x0+((@i-1)*@Step)+(c *@Scale),3) AS varchar(12)),' ',
                    CAST(ROUND(@y0+((6-r) *@Scale),3) AS varchar(12)),
                '))'), 0).MakeValid(),
                'gold'
            FROM @Font
            WHERE ch = @ch;
        END
        SET @i += 1;
    END

    SET @lineNo += 1;
END

--------------------------------------------------------------------------------
-- 2b) PINT OF GUINNESS
--------------------------------------------------------------------------------

-- The pint is a stack of simple polygons: body, head, then four thin
-- rectangles faking the glass sides, bottom and rim.
INSERT INTO #stPatrick(shape, colour) VALUES
-- Dark stout body
(geometry::STGeomFromText('POLYGON((8.2 4.2, 9.8 4.2, 10.1 8.5, 7.9 8.5, 8.2 4.2))', 0).MakeValid(), 'black'),
-- Creamy head
(geometry::STGeomFromText('POLYGON((7.9 8.5, 10.1 8.5, 10.2 9.5, 7.8 9.5, 7.9 8.5))', 0).MakeValid(), 'white'),
-- Glass outline left
(geometry::STGeomFromText('POLYGON((7.8 4.0, 8.1 4.0, 8.1 9.6, 7.8 9.6, 7.8 4.0))', 0).MakeValid(), 'lightgray'),
-- Glass outline right
(geometry::STGeomFromText('POLYGON((9.9 4.0, 10.2 4.0, 10.2 9.6, 9.9 9.6, 9.9 4.0))', 0).MakeValid(), 'lightgray'),
-- Glass bottom
(geometry::STGeomFromText('POLYGON((7.8 4.0, 10.2 4.0, 10.2 4.3, 7.8 4.3, 7.8 4.0))', 0).MakeValid(), 'lightgray'),
-- Glass top rim
(geometry::STGeomFromText('POLYGON((7.8 9.5, 10.2 9.5, 10.2 9.7, 7.8 9.7, 7.8 9.5))', 0).MakeValid(), 'lightgray');

--------------------------------------------------------------------------------
-- 4) Output — slot order carefully controlled
-- The SSMS spatial viewer assigns its colours by result-row order, not by the
-- colour label, so the ORDER BY pins each colour group to a predictable slot.
-- Slot 4 (purple) absorbed by dummy shape off-screen
--------------------------------------------------------------------------------

SELECT colour, geometry::UnionAggregate(shape) AS shape
FROM #stPatrick
WHERE shape IS NOT NULL
AND shape.STIsValid() = 1
GROUP BY colour
ORDER BY CASE colour
WHEN 'darkgreen' THEN 1 -- slot 1 (teal) -> shamrock
WHEN 'gold' THEN 2 -- slot 2 (orange) -> text
WHEN 'lightgray' THEN 3 -- slot 3 (beige) -> glass rim
WHEN 'dummy' THEN 4 -- slot 4 (PURPLE) -> off-screen dummy, never seen
WHEN 'black' THEN 5 -- slot 5 (dark grey) -> pint body
WHEN 'white' THEN 6 -- slot 6 (cream) -> pint head
WHEN 'green' THEN 7 -- slot 7 (sage) -> shamrock centre
ELSE 8
END;

-- Clean up the scratch table.
DROP TABLE #stPatrick;

 

How it works

Let's describe each of the sections of the result.

Shamrock

Each leaf is just a buffered point: geometry::Point(x, y, 0).STBuffer(r) draws a circle

Three circles make the leaves, a smaller circle makes the centre, and a rectangle becomes the stem. Nature is beautiful.

The text

The 5x7 ‘font’ is a table of pixels. Each letter is just a list of row and column coordinates. The loops stamp small squares to build the message.

This is the part you definitely should not copy into production.

The pint

The stout body and creamy head are polygons stacked on top of each other. The glass outline is a few thin polygons to fake a rim and sides.

It’s not realistic. It’s recognisable, which is the only requirement for a St Patrick’s Day post.

The colour slot hack

SSMS doesn’t colour shapes based on your colour names. It assigns colours by row order in the results. So the ORDER BY CASE forces a consistent layer order, and the off-screen dummy shape exists purely to waste a colour slot you don’t want.

Yes, this is ridiculous. That’s the point.

Remix it in 30 seconds

  • Bigger letters: increase @Scale
  • Move the caption: adjust @X1/@Y1 and @X2/@Y2
  • Bigger shamrock: increase the leaf buffer sizes

If SSMS throws an ‘undefined’ error

SSMS can be dramatic with complex spatial results. If the Spatial Results tab crashes, swap:

  • geometry::UnionAggregate(shape)

for:

  • geometry::CollectionAggregate(shape)

Same idea, fewer tantrums.

If it works it should look like this. I ran this in SQL 2022 dev edition

The post SQL Art 2: St Patrick’s Day in SSMS (Shamrock + Pint + Pixel Text) appeared first on SQLServerCentral.

]]>
I’m Not Gonna Waste Time Debunking Crap on LinkedIn. https://www.sqlservercentral.com/articles/im-not-gonna-waste-time-debunking-crap-on-linkedin Mon, 16 Mar 2026 00:00:01 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4746324 LinkedIn is full of absolute trash these days. Just flat out b****** garbage. (Oh yeah, that – this post should probably come with a language disclaimer, because this stuff makes me mad.)

The post I’m Not Gonna Waste Time Debunking Crap on LinkedIn. appeared first on SQLServerCentral.

]]>
The post I’m Not Gonna Waste Time Debunking Crap on LinkedIn. appeared first on SQLServerCentral.

]]>
Webinar: Compliance Without Compromise: Test Data Management That Finally Fits https://www.sqlservercentral.com/articles/webinar-compliance-without-compromise-test-data-management-that-finally-fits Mon, 09 Mar 2026 00:00:11 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4744477 You know you shouldn't have production data in test environments. But every time you look at fixing it, the options feel impossible: enterprise tools that cost six figures and take months to implement, or DIY scripts that sort of work until they don't. Join this webinar on Mar 18 to learn more.

The post Webinar: Compliance Without Compromise: Test Data Management That Finally Fits appeared first on SQLServerCentral.

]]>
You know you shouldn't have production data in test environments. But every time you look at fixing it, the options feel impossible: enterprise tools that cost six figures and take months to implement, or DIY scripts that sort of work until they don't. 

The post Webinar: Compliance Without Compromise: Test Data Management That Finally Fits appeared first on SQLServerCentral.

]]>
How to Use sqlpackage to Detect Schema Drift Between Azure SQL Databases https://www.sqlservercentral.com/articles/how-to-use-sqlpackage-to-detect-schema-drift-between-azure-sql-databases Fri, 13 Mar 2026 00:00:39 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4715000 This article explains how I use sqlpackage to detect schema drift and generate a delta script that shows exactly what’s different.

The post How to Use sqlpackage to Detect Schema Drift Between Azure SQL Databases appeared first on SQLServerCentral.

]]>
Schema drift between environments is one of those problems that usually goes unnoticed, until a deployment fails or a production issue appears. In Azure SQL projects, even small differences in tables, indexes, or procedures can cause unexpected behavior.

Recently, I needed a simple, scriptable way to compare schemas between two Azure SQL databases without relying on Visual Studio or SSDT. I ended up using sqlpackage, and it worked surprisingly well for this purpose.

This article explains how I use sqlpackage to detect schema drift and generate a delta script that shows exactly what’s different.

Why I Avoid Manual Schema Comparison

In many teams, schema comparison still happens in one of these ways:

  • Manually inspecting database objects
  • Relying on ad-hoc scripts
  • Using GUI tools that are hard to automate

These approaches don’t scale well—especially when working with multiple Azure SQL environments (Dev, Test, Prod). I wanted something that was:

  • Lightweight
  • Scriptable
  • CI/CD friendly

That’s where sqlpackage fits nicely.

Installing sqlpackage Before You Begin

Before you can compare two Azure SQL database schemas using sqlpackage, you need to install the sqlpackage command-line utility on your machine. sqlpackage.exe is the same tool that powers the DACPAC deployments discussed in this site’s previous article on deploying DACPACs to Azure SQL Database.

Here are the recommended ways to install sqlpackage, depending on your environment:

Option 1 — Install as a .NET Global Tool (Cross-Platform)

The easiest and most modern way to install sqlpackage on Windows, macOS, or Linux is using the .NET SDK tool installer. This gives you the sqlpackage command directly on your PATH. Run below command in command prompt.

dotnet tool install --global Microsoft.SqlPackage

If you already have it installed and want to update to the latest version, run:

dotnet tool update --global Microsoft.SqlPackage

This method works well for local development and automation scripts, including PowerShell and CI/CD pipelines.

Option 2 — Download Standalone Zip (Windows / macOS / Linux)

If you don’t use the .NET SDK or prefer a self-contained download, Microsoft also publishes standalone builds of sqlpackage:

  1. Visit the official SqlPackage download page on Microsoft Docs.
  2. Download the appropriate zip for your OS.
  3. Extract and place the folder somewhere on your system (e.g., C:\tools\sqlpackage).
  4. Add the folder containing sqlpackage.exe to your system PATH so you can call it from any terminal.

Using sqlpackage to compare databases

Instead of comparing live databases directly, I prefer working with DACPAC files. A DACPAC represents the schema of a database and makes comparisons predictable and repeatable.

At a high level, my workflow looks like this:

  1. Create a DACPAC from the source database (for example, Production)
  2. Compare it against a target database (for example, Test)
  3. Generate a delta script that highlights schema differences

Generating a Schema Difference Script

Once I have the DACPAC, I use sqlpackage with the Script action to generate a schema difference script.

Here’s a minimal example:

sqlpackage /Action:Script ^
/SourceFile:"Prod.dacpac" ^
/TargetServerName:"mytestserver.database.windows.net" ^
/TargetDatabaseName:"TestDB" ^
/OutputPath:"SchemaDiff.sql" ^
/p:DropObjectsNotInSource=False ^
/p:ExcludeSchemaValidation=True

What this does:

  • Compares the schema inside the DACPAC with the target database
  • Generates a SQL script that represents the differences
  • Clearly shows which objects would be added, altered, or dropped

I intentionally keep DropObjectsNotInSource set to False in most cases. This helps avoid accidentally dropping objects that may exist intentionally in the target environment.

How I Read the Delta Script

The generated script is very useful for spotting drift quickly. I typically look for:

  • Tables or columns that exist in one environment but not the other
  • Index changes that may impact performance
  • Stored procedure or function differences

Even without executing the script, it provides a clear picture of how far environments have diverged.

Common Mistakes I Ran Into

A few things worth noting from experience:

  • The target database name is mandatory for the Script action
  • Always review scripts before applying them to higher environments
  • This approach focuses on schema, not data differences

Once these are understood, the workflow becomes very reliable.

Where This Fits Best

I’ve found this approach especially useful for:

  • Pre-deployment validation
  • Detecting accidental schema changes
  • CI/CD pipelines where GUI tools aren’t available
  • Azure SQL environments managed by multiple teams

It’s simple, transparent, and easy to automate.

Final Thoughts

Using sqlpackage for schema comparison has helped me catch schema drift early and avoid surprises during deployments. It’s not a replacement for every database tool, but it’s a solid option when you need a lightweight, automatable way to understand schema differences across environments.

For teams working with Azure SQL and CI/CD pipelines, this approach provides a practical balance between control, visibility, and simplicity.

The post How to Use sqlpackage to Detect Schema Drift Between Azure SQL Databases appeared first on SQLServerCentral.

]]>
What’s New in SSMS 22 https://www.sqlservercentral.com/articles/whats-new-in-ssms-22 Fri, 13 Mar 2026 00:00:07 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4736386 SSMS 22 has several new features. Is there a compilation list of the new features? How will these features help me as a SQL Server professional?

The post What’s New in SSMS 22 appeared first on SQLServerCentral.

]]>
The post What’s New in SSMS 22 appeared first on SQLServerCentral.

]]>
Going Native with Fabric Spark Pools: The Fabric Modern Data Platform https://www.sqlservercentral.com/articles/going-native-with-fabric-spark-pools-the-fabric-modern-data-platform Wed, 11 Mar 2026 00:00:10 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4730720 The next installment of the Fabric Modern Data Platform looks at Spark pools for executing code.

The post Going Native with Fabric Spark Pools: The Fabric Modern Data Platform appeared first on SQLServerCentral.

]]>
The open-source version of Apache Spark is written in Scala. The issue with Scala (Object Oriented Java) is the fact that it executes more slowly than natively compiled code. Before there were language APIs for Spark, developers used to worry about resilient distributed datasets (RDDs), which are read-only sets of data items distributed over the computing nodes. The Dataframe API eliminated the need for dealing with RDDs and the SQL language allows developers to use relational knowledge on dataframes stored as temporary views.

In short, the PySpark language has simplified the data engineering process.  How can we configure and tune the Fabric Spark Pool so that our programs execute faster on the same number of cores?

Business Problem

There are settings in the Fabric environment at the Tenant, Workspace and Notebook level that allow the administrator and/or developer to control how a PySpark program executes.  Today, we are going to cover each setting that can be changed to tune your Spark program.

We need a rather large parquet dataset to perform this analysis on. The New York Yellow Cab dataset contains 1.5 billion rows of data stored in 50 GB of parquet files. A simple PySpark program will be used to read the data into a dataframe, print the number of rows, and display the top N records.

Spark Pool Configuration

Starter pools are one reason Microsoft Fabric gained popularity.  Previously, a developer would have to wait two to five minutes for a cluster to start up before the job started execution.  Instead, these pre-hydrated pools owned by Microsoft come up in mere seconds if the conditions are right.

High concurrency sessions (clusters) support the idea that sharing is caring.  Yes, the initial startup of a custom pool might be slow.  However, additional workloads can re-use the cluster until it times out.  Thus, the end user might see up to thirty-six times faster startup notebooks after the first session loads.

Finally, Microsoft has caught up with Azure Databricks by supplying the developer with a natively compiled execution engine.  The Native Execution Engine (NEE) is based on  Velox, a C++ database acceleration library, and Gluten, a middle layer which offloads Spark SQL execution to native engines for speed.

The above image shows impressive speed improvements with the Spark engine.  Please note that all images are from Microsoft.

Spark Pool Configuration

We need to have a sample program to tune.  The following PySpark code has been identified as needing a faster execution time.  We will focus on tuning this code using various techniques.

# New York City yellow-cab trips, read from Azure Open Datasets public storage.

# Public blob coordinates (anonymous container, so the SAS token is empty).
account = "azureopendatastorage"
container = "nyctlc"
relative_path = "yellow"
sas_token = r""

# Build the wasbs path and register the (empty) SAS token with Spark so the
# storage account can be read.
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (container, account, relative_path)
spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (container, account), sas_token)
print('Remote blob path: ' + wasbs_path)

# Reading parquet is lazy; also expose the dataframe to Spark SQL as a temp view.
df = spark.read.parquet(wasbs_path)
print('Register df as SQL view named yellow_cabs')
df.createOrReplaceTempView('yellow_cabs')

# count() forces a scan of every parquet file to total the rows.
print(df.count())

# LIMIT 5 needs only a partial read of the parquet files.
display(spark.sql('SELECT * FROM yellow_cabs LIMIT 5'))

The capacity of the workspace is one limiting factor.  We can see that an F8 SKU is being used in the image below.  That means sixteen virtual cores can be used at most at one time.

If we look at the default starter pool, the workspace settings below show that we can have up to six medium nodes.

If we look up the node sizes, we can see a medium node has eight virtual cores. If we do simple math, the capacity has sixteen virtual cores, but the Spark Session says we can use forty-eight total cores in a six-node cluster. How is this so?

This is where the concepts of bursting and throttling come into play. Please see the online documentation for details. We can burst up to three times the capacity. However, there is no free lunch. We will be throttled in the future to pay back the temporary scaling in computing power. Please see the paragraph below for when starter pools might take a long time to start! Typically, a starter pool connects to a standard session in seconds.

I ran into this same issue with a client when an ODBC driver had to be installed as part of the environment.  To recap, starter pools are impressive but there are some configurations that will take more time to start.

Standard Session with Bursting Enabled

The image below shows the execution of PySpark code.  If we did not have a count operation, we would just scan one random file and bring back ten records.  Instead, we need to read each parquet file and get the record count.  Job 15 shows it took almost five hundred rows or file counts to complete this task.

How many virtual cores did we use to accomplish these jobs?

 

We can see that up to twenty-four cores were allocated to execute this distributed code but only eight were used (running).  The graph is misleading since it shows a maximum of forty cores.  The reason for this limit is that one node is the driver that controls the job execution and that medium size node takes eight virtual cores.

Be careful that you do not run out of capacity when running multiple notebooks.

The above image of the monitor hub shows what is currently executing regardless of Fabric object type.  We have five notebooks running.  The sessions stay open until they time out.  If you receive an out of capacity message when executing a cell, check the monitor for open and forgotten activities.

We can change the default Spark settings job timeout from twenty minutes to five minutes so that forgotten sessions are automatically closed.  The above image shows the default setting on the jobs panel.

Standard Session with Bursting Disabled

The first task is to disable bursting.  Please go into the admin portal and find the capacity settings page.  Next, click the hyperlink of the capacity you want to configure.

Change the spark settings to not allow bursting.  Please note, if we want to force everyone to use pools that we declare, we can disable the customized workspace pools and disable starter pools.  This would require us to define pool for use by the end users.  Please see documentation for details.

Make sure you apply the settings to complete the task.  We can look at the workspace settings for data engineering and science.  We can see that the tenant level change affected the workspace settings.

One major change we can identify without executing the program is the drop down for the default workspace settings.  We can see only 1-2 nodes are available versus the 1-6 we had during the first execution.

There is only a few seconds difference between the two standard sessions.  The bulk of the processing is reading in all the rows to get a correct count.

The graph of the executors is different.  Since we have only eight virtual cores, we are fully using everything, and no scaling occurs with un-used but allocated cores.

Disabling bursting at the tenant level saved our boss money and did not decrease the total execution time.

High Concurrency Session

Let's take a look at two different sessions using the same starter pool (cluster). The image below shows the “going native” notebook.

The image below shows the “ingest csv files” notebook.  They both share the same resources (Spark cluster).

The execution time remains the same.  We are just sharing resources.  Since I did not run them in parallel, there is no change.

Custom Pool Size

Another option that workspace owners have is the ability to create a custom spark pool size and set it as the workspace default.

A capacity with an F8 SKU has sixteen cores.  A small node size uses four cores.  Therefore, we can have a four-node cluster.  Let us see if this custom starter cluster executes the PySpark program any faster?

The execution time was shorter by 10 seconds.  Moving the count operation to the bottom of the cell might be the culprit.  However, the startup time was almost four minutes!  Remember, starter clusters have to have a medium node size.  Deviating from the starter requirements results in a custom cluster.

Native Execution Engine

To leverage the native execution engine, we must create an environment.  We can do this by hitting the new item (+) on the workspace and search for the “environment” key word.

Of course, we have to give the environment a name.

Typically, environments were used to load custom libraries, set computing configurations and/or apply spark settings.  Any of these changes does not allow the Spark session to use a starter pool.  However, if we just enable the native execution engine for acceleration, we can use the default starter pool.

We need to publish the environment before we can choose it in our PySpark notebook.

The image below shows the environment that has been selected.  Click execute cell arrow to start the job.

The job that reads the 1.5 billion rows now takes 17 seconds. It typically took around one hundred seconds to execute. That means the native execution engine is roughly five times faster.

The spark resources show a different pattern than the previous executions.  We have a choppy graph instead of a step function or sloping graph.

Experimental Findings

Today, we tried five different spark configurations while executing the same PySpark program.  The first three configurations have the same execution time.  Because the custom pool could not use a starter pool, the execution time was over 5 minutes due to cluster startup.  The environment with the native execution engine was the clear winner.

 

Configuration Execution Time (s) Max v-Cores

 

Comments

 

1 116 24 Standard session with bursting enabled
2 118 16 Standard session with bursting enabled
3 118 8 High concurrency session with bursting disabled
4 327 8 Custom pool with four small nodes
5 44 8 Native execution engine using standard session.

The image below is a paragraph taken from the MS learn website.  I chose to work with Parquet files since I knew the engine would read them quicker than the Scala JVM engine.  If I chose to work with CSV files, the engine would have fallen back to the standard JVM engine.  Any computational and/or intricate transformations will benefit from the new native execution engine.

The paragraph below was taken from the MS learn website.  We can turn on or off the engine at both the notebook and cell levels.  The code below configures the whole notebook to try to use the new engine.

We can enable or disable the engine for Spark SQL at the cell level.

Of course, we can do the same for PySpark.

There are a bunch of limitations and cautions that you should read before switching to the new engine.   Please see documentation for details.  To recap, the native execution engine had the best runtime using the least amount of resources.

Summary

As a Fabric data engineer, it is important to understand where changes can be made for Data Engineering using Spark.  The tenant level settings can be used to disable or enable starter pools.  Additionally, we can disable bursting.  Just remember, there is a queue length and retry logic that should be used in the data pipeline.  We can also force end users to use pools that administrators set up at the tenant level.  Just remember starter pools allow one to ten medium size nodes.  After that limit is exceeded, a normal spark cluster start time is two to five minutes.

The workspace settings allow administrators to create environments and custom pools.  Again, not using medium size nodes will result in a longer startup time.  On the other hand, if you are using high concurrency sessions, an extra-large cluster can be shared between multiple people.  Standard sessions use resources but can not be shared between notebooks and/or users.  One of the most important workspace settings is the cluster timeout.  Too short of a time results in restarting a session.  Too long of a time might result in an out-of-resource situation.  One can use an environment to apply custom libraries, spark settings, and/or drivers.  Just remember these environments cannot use a starter pool.

Finally, the native execution engine shows tremendous speed improvement with reading parquet and delta files.  We can control the use of native execute engines for notebooks via custom environments, notebook configurations, and/or cell settings.  One thing I leave the reader to explore is the use of the engine with complex calculations.  Here is a summary of limitations that you might encounter:  date filter type mismatches, decimal to float casting issues, inconsistent rounding behavior, time zone configuration error handling, skipping duplicate key check in mapping function, no support for UDFs,  no support for text related formats, and order variance / type mismatch for collect functions.  Please see the limitations section of on-line documentation for complete details.

In short, the native execution engine is pretty amazing.  It speeds up ingestion with the Velox enhancement and pushes JVM execution to natively compiled engine using Gluten.  For standard Spark processing that has an enormous amount of parquet files or complex processing, one might want to look at this engine to decrease the time your data engineering pipeline runs with the same amount of resources.

 

The post Going Native with Fabric Spark Pools: The Fabric Modern Data Platform appeared first on SQLServerCentral.

]]>
Getting Started with mssql-python for Data Developers https://www.sqlservercentral.com/articles/getting-started-with-mssql-python-for-data-developers Wed, 11 Mar 2026 00:00:54 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4736394 Programmatic interaction with SQL Server or Azure SQL from a Python script is possible using a driver. A popular driver has been pyodbc that can be used standalone or with a SQLAlchemy wrapper. SQLAlchemy on its own is the Python SQL toolkit and Object Relational Mapper for developers. At the end of 2025 Microsoft released v1 of their own Python SQL driver called mssql-python. How do you get started using mssql-python for programmatic access to your SQL Server?

The post Getting Started with mssql-python for Data Developers appeared first on SQLServerCentral.

]]>
The post Getting Started with mssql-python for Data Developers appeared first on SQLServerCentral.

]]>
Designing Delta Tables with Liquid Clustering: Real-World Patterns for Data Engineers https://www.sqlservercentral.com/articles/designing-delta-tables-with-liquid-clustering-real-world-patterns-for-data-engineers Mon, 09 Mar 2026 00:00:07 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4726896 Get a short introduction to Liquid Clustering in Databricks.

The post Designing Delta Tables with Liquid Clustering: Real-World Patterns for Data Engineers appeared first on SQLServerCentral.

]]>
Introduction

If you’ve ever stared at a Spark UI wondering why a “simple” query is scanning half a terabyte of data, you already know the core problem Liquid Clustering is trying to solve. Most of the time, the engine doesn’t have a good way to avoid reading files that don’t matter to your query.

Partitioning and ZORDER help, but they’re brittle: you pick keys up front, workloads change, and you end up with either skewed partitions, way too many small files, or a pile of manual OPTIMIZE jobs to keep things afloat. Liquid Clustering flips that around. You tell Databricks which columns drive your queries (or let it pick them for you), and it keeps reorganizing files over time so that related rows live together on disk. That’s it. Better skipping, fewer files per query, less layout busywork.

What Liquid Clustering Actually Does

Liquid Clustering is a table-level layout strategy for Delta (and some managed Iceberg) tables. Instead of carving your table into physical folders like region=US/, region=EU/, it groups rows into logical clusters based on one or more columns and writes those clusters into files.

From your point of view as a data engineer:

  • Filters and joins on the clustering columns usually touch a much smaller subset of files.
  • Delta’s file stats (min/max, row count, etc.) become more meaningful, so data skipping actually kicks in.
  • The layout is stateful: the transaction log tracks where data lives, and the optimizer can keep improving it incrementally instead of rewriting the whole table on every change.

You don’t lose control—you still choose (or influence) the keys—but you offload the hard parts of file layout to the platform.

Why It’s Not Just Fancy Partitioning

Partitioning is a blunt instrument: you get one directory tree, and changing it later is painful. It works great for simple, single-dimension access patterns (for example, always filtering by event_date) but starts to fall apart when

  • Data is heavily skewed (one region or tenant dominates).
  • Users filter on different combinations of columns over time.
  • You need to add more partition columns and suddenly have thousands of tiny partitions to manage.

Liquid Clustering removes the folder boundary and gives the optimizer more freedom. You still pick a handful of important columns (or use auto), but Databricks decides how to group values into files and can change that as new data lands and query patterns shift. That means:

  • No directory explosion from high-cardinality partition columns.
  • Less “DDL regret” when workloads evolve.
  • Incremental layout fixes via OPTIMIZE instead of full table rebuilds.

In practice, you end up with fewer “oops, we picked the wrong partition key two years ago” conversations.

How It Works in Broad Strokes

Here’s how I’d explain the internals to another engineer on a whiteboard:

  • A clustering component looks at your chosen columns (or auto-selected keys) plus data distribution and query stats to figure out reasonable clusters.
  • A file layout manager turns those clusters into physical files, aiming for balanced file sizes and avoiding the classic small-file problem.
  • The Delta log keeps track of all of this, which lets the engine do incremental improvements instead of recomputing layout from scratch.
  • OPTIMIZE (and Predictive Optimization / automatic liquid clustering if enabled) runs re-clustering in the background so layout doesn’t silently rot as volume grows.

You don’t manage “partitions” anymore; you manage “is clustering enabled, on which columns, and are we keeping up with OPTIMIZE.”

Enabling Liquid Clustering: What You Actually Type

Before you flip the switch, as a data engineer, you should know three things up front:

  • Runtime support: check you’re on a runtime that supports liquid clustering and, if you care about auto, Predictive Optimization and CLUSTER BY AUTO.
  • Table type: this is for Delta / UC managed tables, not every random Parquet folder.
  • Workload: know your top WHERE/JOIN/GROUP BY columns for that table—those are your first clustering candidates.

If you can’t answer “who queries this table and how?”, you’ll mostly be guessing.

New Table Example

For a fresh table, you just add CLUSTER BY:

-- Delta table for sales transactions with Liquid Clustering enabled.
-- CLUSTER BY (region, product_category) targets queries that slice by
-- region and product category (e.g. "revenue by category per region"),
-- so those filters touch fewer files once clustering has converged.
CREATE TABLE sales_data_clustered (
    transaction_id STRING,      -- unique transaction identifier
    customer_id STRING,
    product_category STRING,    -- clustering key #2
    region STRING,              -- clustering key #1
    transaction_date DATE,
    amount DECIMAL(10,2),       -- exact decimal; avoids float rounding on money
    quantity INT
)
USING DELTA
CLUSTER BY (region, product_category);  -- logical clustering instead of physical partition folders

Here, you’re optimizing for queries that slice by region and product category. Think “revenue by category per region” or “top products by region.” Those queries should hit fewer files once clustering has converged.

Converting an Existing Table

For an existing unpartitioned Delta table:

-- Enable Liquid Clustering on an existing unpartitioned Delta table.
-- NOTE: this only sets the clustering definition going forward; historical
-- data is NOT reclustered until OPTIMIZE (or OPTIMIZE FULL) runs.
ALTER TABLE existing_sales_table
CLUSTER BY (region, product_category);

That sets the clustering definition going forward. It does not magically recluster all historical data in one shot. To actually see benefits, you:

  • Run OPTIMIZE existing_sales_table; on a schedule.
  • Optionally use OPTIMIZE FULL when you want to force a full re-cluster (more expensive, but sometimes worth it for key tables).

DataFrame / Spark API

The options you used are fine and easy for engineers to reason about:

from pyspark.sql import SparkSession

# Demo: write a small sales dataset as a Delta table with clustering
# options supplied at write time.
spark = SparkSession.builder.appName("LiquidClusteringDemo").getOrCreate()

# Column names matching the CREATE TABLE examples earlier in the article.
schema_cols = [
    "transaction_id", "customer_id", "product_category",
    "region", "transaction_date", "amount", "quantity",
]

# Five sample transactions spanning several regions and categories.
sample_rows = [
    ("T001", "C001", "Electronics", "US",   "2024-01-15", 1500.00, 2),
    ("T002", "C002", "Clothing",    "EU",   "2024-01-16",  250.00, 3),
    ("T003", "C003", "Electronics", "ASIA", "2024-01-17",  800.00, 1),
    ("T004", "C004", "Books",       "US",   "2024-01-18",   45.00, 5),
    ("T005", "C005", "Electronics", "EU",   "2024-01-19", 2200.00, 1),
]

sales_df = spark.createDataFrame(sample_rows, schema_cols)

# NOTE(review): the delta.clustering.* writer options are passed through to
# the table here — confirm against the target runtime's documented way of
# enabling liquid clustering (e.g. CLUSTER BY / clusterBy) before relying on it.
writer = (
    sales_df.write.format("delta")
    .option("delta.clustering.enabled", "true")
    .option("delta.clustering.columns", "region,product_category")
    .mode("overwrite")
)
writer.saveAsTable("sales_clustered_python")

From the job perspective, this is just a normal write, but the resulting files are laid out with clustering in mind.

Advanced Strategies: Multi-Column Clustering

Use multi-column clustering when your real queries almost never filter on a single column. For “customer analytics” style tables, that’s almost always the case.

-- Customer analytics table with multi-column clustering.
-- The three keys together serve cohort queries such as
-- "LTV for an age group in a region across income brackets".
CREATE TABLE customer_analytics (
    customer_id STRING,
    age_group STRING,           -- clustering key #2
    income_bracket STRING,      -- clustering key #3
    region STRING,              -- clustering key #1
    signup_date DATE,
    lifetime_value DECIMAL(12,2)  -- exact decimal for monetary value
)
USING DELTA
CLUSTER BY (region, age_group, income_bracket);

This is tuned for questions like “What does LTV look like for 25–34-year-olds in APAC across income brackets?” Clustering those three columns together means those cohorts live in fewer files, and your aggregations do less work.

Advanced Strategies: Time-Heavy Workloads

For time-series and IoT, you don’t have to cluster on time directly; often it’s better to cluster by who/what/where and let time ride along.

-- IoT telemetry table: cluster by who/where (location, device_type)
-- rather than by time; typical queries filter by plant + sensor type
-- with a time range, and the timestamp rides along within each cluster.
CREATE TABLE iot_sensor_data (
    sensor_id STRING,
    device_type STRING,     -- clustering key #2
    location STRING,        -- clustering key #1
    timestamp TIMESTAMP,
    temperature DOUBLE,
    humidity DOUBLE,
    pressure DOUBLE
)
USING DELTA
CLUSTER BY (location, device_type);

Most queries look like “temperature anomalies for temp sensors in Plant A over the last day.” Having those readings grouped by (location, device_type) means each query hits a tight set of files instead of rummaging through the entire lake for that plant.

How This Shows Up in Real Pipelines

For an E-Commerce warehouse, the typical patterns:

  • Daily/weekly revenue by region and category.
  • Customer value by region and category for a time window.

A clustered layout such as:

CLUSTER BY (region, category, order_date)

turns “read a giant heap of order rows and then filter” into “read the subset of files that actually contain US/EU + Electronics for that date range.” In Spark UI terms, your scan stage shows fewer files read, fewer input bytes, and lower task counts.

For an IoT Telemetry system, you set up something like:

CLUSTER BY (facility_id, sensor_type)

and then:

  • Your 24-hour rollups per facility/sensor type hit fewer files.
  • Your anomaly queries (“last 1 hour for FAC001 temp sensors”) stop being mini full-table scans.
  • You don’t need an explosion of timestamp partitions just to keep things fast.

For trading / finance, we might use:

CLUSTER BY (trade_date, sector, exchange)

You’re tightly aligning the layout with how risk and reporting teams actually fetch data—almost always by date + sector + where it traded. The practical upside: EOD/EOM jobs become much more predictable even as trade volume grows.

Performance: What You Actually See

Event-by-event, the engine is still Spark. The win comes from better skipping and saner file sizes:

  • Fewer files opened per query.
  • Less data read from storage for the same filters.
  • Fewer straggler tasks caused by badly skewed partitions.

On well-chosen keys, it’s very normal to see ~30–60% runtime reductions on heavy analytical queries versus an unclustered layout, and much higher improvements if you were previously doing near full-table scans. The exact number doesn’t matter as much as the pattern: scans become more proportional to the size of the slice you’re querying, not the entire table.

Monitoring and Day-2 Ops for Engineers

You don’t need a gigantic monitoring stack; you just need to know when clustering is drifting.

A few useful checks are:

  • DESCRIBE DETAIL table: watch numFiles and sizeInBytes to compute average file size.
  • SHOW TBLPROPERTIES table ('delta.clustering.columns'): confirm what the table thinks it’s clustered on.
  • Basic dashboards for file count, table size, and last OPTIMIZE run.

Your Python helper that flags “too many files or tiny files” is exactly the kind of pragmatic guardrail that fits into a simple notebook or job. If that check starts failing, you bump OPTIMIZE frequency, adjust clustering keys, or look at write patterns.

When to Skip Liquid Clustering

You don’t need Liquid Clustering everywhere. You can confidently skip it when:

  • The table is tiny (< 1 GB) and queries are fast anyway.
  • You have a pure write-heavy table where reads are rare or trivial; don’t slow ingests down just to get a theoretically nicer layout.
  • Workloads are basically “WHERE event_date = 'X' and nothing else.” Classic partitioning on that date is simple and works well.

In those cases, partitioning plus the occasional OPTIMIZE (or even no special layout at all) is perfectly fine.

How to Pick Clustering Columns

When you’re picking keys, think like this:

  • Start from queries, not from schema. Look at actual SQL: what’s in WHERE, JOIN ON, and GROUP BY?
  • Avoid fields with 2–3 distinct values (for example, booleans, coarse enums). They tend to cause skew.
  • Use IDs cautiously; they’re usually too unique unless your use case genuinely needs that granularity.
  • Keep it to 1–4 columns. More than that is rarely worth the mental and operational overhead.

If you already track basic column stats or have a helper to measure selectivity, that’s a nice way to add some data to your decision instead of going purely by gut feel.

Automation: How “Hands-Off” Can You Get?

On newer runtimes, you can combine Liquid Clustering with Predictive Optimization and automatic key selection (CLUSTER BY AUTO). That gives you:

  • Automatic choice and adjustment of clustering keys based on actual query history.
  • Background reclustering jobs that keep layout healthy without you scheduling everything manually.
  • A mostly “hands-off” experience for delta tables where you don’t want to argue about keys in every design review.
  • Liquid clustering can also be enabled while creating Materialized Views in Databricks.

Even then, most teams still keep a light monitor in place—if query performance or file counts drift, you’ll want to know and decide whether to override the auto choice.

About the Author

Mehul Bhuva is a Data and AI platform engineer and Microsoft Azure Developer Influencer with over 20 years of experience building and modernizing data-intensive systems on Microsoft Azure, including Databricks and Delta Lake. He works end-to-end across data pipelines, analytics, and applications, using tools such as Azure Data Factory, Azure Databricks, Azure Functions, Power BI, C#.NET, Blazor, Angular, PowerShell, and SQL Server. Through his blog, SharePointFix.com, he focuses on sharing practical, reproducible patterns that data engineers can drop directly into real projects without unnecessary complexity.

LinkedIn Profile: https://www.linkedin.com/in/mehulbhuva/

The post Designing Delta Tables with Liquid Clustering: Real-World Patterns for Data Engineers appeared first on SQLServerCentral.

]]>
Fun with JSON https://www.sqlservercentral.com/articles/fun-with-json Wed, 18 Feb 2026 00:00:55 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4719165 Last year, I used a lot of JSON to exchange data between systems. There are several ways to extract data from a JSON file, but there is one specific, probably less-used possibility that I’d like to highlight. For one project, I received JSON files containing a variable number of parameters and their values. If I […]

The post Fun with JSON appeared first on SQLServerCentral.

]]>
Last year, I used a lot of JSON to exchange data between systems. There are several ways to extract data from a JSON file, but there is one specific, probably less-used possibility that I’d like to highlight. For one project, I received JSON files containing a variable number of parameters and their values. If I had known all the parameter names, I could have used a more common way to extract the data, but that was not the case.

Now JSON contains key-value pairs. In SQL Server, you have the ability to get these key and value pairs with the OPENJSON table-valued function. This gives you a result table with the keys, the values, and the type of value (see OPENJSON (Transact-SQL) - SQL Server | Microsoft Learn). With this solution in mind, I got excited about other possibilities.

Let’s start with an example. First we need to create a table and insert a few rows.

-- Step 1: Create temp table for the JSON examples that follow.
create table #test_data 
    (
    id int primary key,
    name varchar(100),
    birth_date date
    );

-- Step 2: Insert rows.
-- Explicit column list guards against silent breakage if the schema changes.
insert into #test_data (id, name, birth_date)
values(1, 'Olivia', '2025-01-05'),
      (2, 'Emma', '2025-03-02'),
      (3, 'Liam', '2025-11-15'),
      (4, 'Noah', '2025-12-22');

Now we are able to select the rows in the table as JSON output with the FOR JSON PATH clause.

-- Step 3: Create JSON for table data
-- FOR JSON PATH returns the whole result set as one JSON array,
-- one object per row, keys taken from the column names.
select t.* from #test_data as t for json path 

The result is a JSON containing all rows as an array of JSON objects (see Format Query Results as JSON with FOR JSON - SQL Server | Microsoft Learn)

In the next step we’re going to split the result JSON output from previous step, with the OPENJSON table valued function.

-- Step 4: Select from JSON 
-- OPENJSON over the array yields one row per array element:
-- "key" = element index, "value" = the row object as JSON text (type = 5).
select * from openjson((select t.* from #test_data as t for json path)) t

The result is a key-value-pair for each array element in the JSON array. The key is the element number in the array, the value is a JSON (type=5) from the row.

Now we are going to split the JSON values again to key-value pairs, but we want to keep the element number of the original row.

-- Step 5: Break down JSON to a single row for each column
-- Outer OPENJSON: one row per array element (t1."key" = source row number).
-- CROSS APPLY OPENJSON over each row object: one row per column,
-- producing (row, column key, column value, value type).
select t1."key" as row, t2.* 
    from openjson((select t.* from #test_data as t for json path)) t1
    cross apply openjson(t1.value) t2

The result shows the element as row and a key-value-pair for each column.

With a little more effort, you could select the original key instead of the element number. To prevent the column from being added twice (id and key_column), the WHERE clause is used to omit the key column in the result.

-- Step 6: Break down JSON to a single row for each column and use key instead of row
-- The source query aliases id as key_column so each row object carries its own key.
-- json_value() pulls that key back out of the row's JSON; the WHERE clause
-- drops the duplicated key_column entry from the exploded column list.
select 
   json_value(t1.value, '$.key_column') as key_column,
   t2.* 
   from openjson((select t.id as key_column, t.* from #test_data as t for json path)) t1
   cross apply openjson(t1.value) t2
   where t2."key" <> 'key_column'

Now the result shows the key column instead of the row.

The last step is to put the JSON, for example for one selected row, into a variable. This gives us the possibility to execute a function or procedure with a JSON variable as parameter.

-- Step 7: Use variable with JSON 
-- A FOR JSON PATH result fits in an nvarchar(max) variable, so it can be
-- passed to a function or procedure as a single JSON parameter.
declare @variable nvarchar(max) = (select t.* from #test_data as t where t.id = 1 for json path)
select t1."key", t2.* 
    from openjson(@variable) t1
    cross apply openjson(t1.value) t2

The result is the key-value-pair for each column of the selected row.

Now that we are able to split a JSON into a row for each column, we could also use this to create a CSV or to replace multiple values in a text.

JSON to CSV

We regularly get asked to export the result of a query to a file. The first question is always: “What are you going to do with the result?”. But often, an export is needed.

Now we are able to break down a JSON to a row for each column, we could also reassemble it to a comma separated file. The example query from step 5 already gives us all the information we need.

We are going to create a function JSON_to_CSV with the following parameters:

  1. JSON parameter
  2. First row is column heading
  3. Column separator
  4. Line feed

Create the function below.

-- Step 8: Create and execute function JSON to CSV 
-- Transforms a FOR JSON PATH document into a single delimited string.
-- Parameters:
--   @json_parameter           : JSON array, e.g. produced by
--                               "select * from table for json path, include_null_values"
--   @first_row_column_heading : 1 = prepend a quoted header row built from the JSON keys
--   @column_separator         : delimiter between columns (default ';')
--   @line_feed                : delimiter between rows (no default; caller must supply)
-- Returns: nvarchar(max) CSV text, or NULL for empty/invalid input.
-- NOTE(review): STRING_AGG without WITHIN GROUP (ORDER BY ...) does not
-- guarantee column order, and UNION ALL does not guarantee the heading row
-- is emitted first — confirm ordering on the target server, or add explicit
-- ORDER BY clauses, if strict column/row ordering is required.
create or alter function "json_to_csv"
(
   @json_parameter             nvarchar(max),
   @first_row_column_heading   int = 1,
   @column_separator           char(1) = ';',
   @line_feed                  char(1)        
)
returns nvarchar(max)
as
begin
    -- Transform JSON to CSV. For the JSON parameter use a query like "select * from table for json path, include_null_values")
    return(select string_agg(csv.row_value, @line_feed) -- Line feed
             from (select top 1  -- Column heading (built from the keys of the first array element)
                      (select string_agg(t22.value, @column_separator) 
                            from(select quotename(t2."key", '"') as value
                                    from openjson(t1.value) t2
                                    where t2.type in (0, 1, 2, 3)) t22) as row_value  -- Check types to use.
                         from openjson(@json_parameter) t1
                         where @first_row_column_heading = 1 
                       union all 
                       select 
                         (select string_agg(t22.value, @column_separator) from(select  -- Column separator. Text in double quotes.
                                           case when t2.type = 0 then 'NULL'
                                                when t2.type = 1 then quotename(t2.value, '"')
                                                when t2.type in (2, 3) then t2.value
                                                end as value
                                           from openjson(t1.value) t2) t22) as row_value    
                            from openjson(@json_parameter) t1
                         ) csv
       )
end
GO 

We can test the function with the query below. Be sure to add INCLUDE_NULL_VALUES when selecting rows as JSON. Otherwise the columns with NULL-values are not present in the result.

-- Step 9: Execute JSON to CSV
-- INCLUDE_NULL_VALUES is required: without it, NULL columns are omitted
-- from the JSON and would be missing from the CSV output.
select dbo.json_to_csv((select * from #test_data for json path, include_null_values), 1, ';', char(10)) as csv

The result is one long string with separators for columns and rows. If we set the result output in the SSMS to text, we get the result below.

Note: If the values in the JSON could contain the column separator or the line feed character, the result could be unpredictable. You first have to replace these characters or use different characters.

Merge text

Recently I had to replace multiple values in a text. I wanted to create a generic function that could replace multiple values at once. This function would have 2 parameters:

  1. The text with the variable names. The variable names are placed between brackets (e.g. [NAME]).
  2. A list of variable names and values.

For the second parameter I used a JSON string.

To be sure that I only process variables that are used in the text, I added a where clause to check if the variable exists. I didn’t want to use a cursor to loop through the variable names. To prevent an infinite loop, I added a maximum (100) — for example, when the value of a variable also contains a variable name with brackets.

-- Step 10: Create function Merge text
-- Replaces every [variable] placeholder in @source_text with the matching
-- value from the key/value pairs in the @json_parms JSON array.
-- Parameters:
--   @source_text : text containing placeholders in brackets, e.g. [NAME]
--   @json_parms  : JSON array of objects whose keys are the variable names
-- Returns: @source_text with all found placeholders replaced; the input
-- unchanged if @json_parms is not valid JSON.
-- The loop is capped at 100 iterations to avoid an infinite loop when a
-- replacement value itself contains a bracketed variable name.
create or alter function "merge_text"
(
   @source_text   nvarchar(max),
   @json_parms    nvarchar(max)
)
returns nvarchar(max)
as
begin

    declare @loop_count int,
            @result_text nvarchar(max) = @source_text 
    
    if isjson(@json_parms) = 1 
    begin 
        select @loop_count = 1  -- sets @@rowcount for first loop
        -- Loop while the previous iteration replaced something (@@ROWCOUNT = 1);
        -- the inner SELECT matches 0 rows once no placeholder remains.
        while @@ROWCOUNT = 1  
          and @loop_count < 100
        begin 
            -- TOP 1 without ORDER BY: which placeholder is replaced first is
            -- arbitrary, but all of them are processed across iterations.
            select top 1
                @loop_count += 1, 
                @result_text = replace(@result_text, t1.quote_name_key, t1.[value])
                from (select 
                         quotename(t2."key") as quote_name_key,
                         t2.[value],
                         t2.[type]
                         from openjson(@json_parms) t1
                         cross apply openjson(t1.value) t2 
                         where charindex(quotename(t2.key), @result_text) > 0  -- Value used 
                           and quotename(t2.key) <> t1.value
                           and t2.value is not null
                         ) t1 
        end 
    end 
    return @result_text
end
GO

You can test the function with the example @variable value from the #test_data above.

-- Step 11 Execute merge text 
-- @variable holds the FOR JSON PATH representation of row id = 1; merge_text
-- substitutes [id], [name] and [birth_date] placeholders in @text with it.
declare @text nvarchar(max) = N'Hello [name], I want to merge your record with ID=[id], name=[name] and birthdate [birth_date] into the text.',
        @variable nvarchar(max) = (select t.* from #test_data as t where t.id = 1 for json path)

select dbo.merge_text(@text, @variable) as merged_text

Final words

This is what I wanted to share. Thanks Erland Sommarskog for your tips!

The last step cleans up the created objects.

-- Step 12: Clean-up
-- Drop all objects created by the article; IF EXISTS keeps this idempotent.
drop table if exists #test_data
drop function if exists dbo.json_to_csv
drop function if exists dbo.merge_text

You can find the complete SQL script attached to this article.

Have fun with JSON!

The post Fun with JSON appeared first on SQLServerCentral.

]]>
SQL Server Long-Running Query Detection: How to Find and Monitor Slow Queries https://www.sqlservercentral.com/articles/sql-server-long-running-query-detection-how-to-find-and-monitor-slow-queries Mon, 09 Mar 2026 00:00:28 +0000 https://www.sqlservercentral.com/?post_type=ssc_article&p=4736385 Monitoring and alerting can be overwhelming tasks, especially for those new to the world of managing production data. One common challenge for any data professional is the identification and management of queries that run for longer than they should.

The post SQL Server Long-Running Query Detection: How to Find and Monitor Slow Queries appeared first on SQLServerCentral.

]]>
The post SQL Server Long-Running Query Detection: How to Find and Monitor Slow Queries appeared first on SQLServerCentral.

]]>