This document contains a set of SQL queries run with duckdb to generate a managable subset of the spotify metadata available here: https://www.kaggle.com/datasets/lordpatil/spotify-metadata-by-annas-archive
curl https://install.duckdb.org | sh add path as suggested duckdb
COPY (
SELECT *
FROM 'tracks.parquet'
WHERE popularity > 50
) TO 'filtered_tracks.parquet' (FORMAT PARQUET);COPY (
SELECT *
FROM 'playlists.parquet'
WHERE followers_total > 300000
) TO 'playlists_follow_300kplus.parquet' (FORMAT PARQUET);COPY (
SELECT playlist_tracks.*
FROM 'playlist_tracks.parquet' playlist_tracks
INNER JOIN 'playlists_follow_300kplus.parquet' playlists
ON playlist_tracks.playlist_rowid = playlists.rowid
) TO 'playlist_tracks_matching.parquet' (FORMAT PARQUET);COPY (
SELECT track_artists.*
FROM 'track_artists.parquet' track_artists
INNER JOIN 'filtered_tracks.parquet' filtered_tracks
ON track_artists.track_rowid = filtered_tracks.rowid
) TO 'tracks_artists_matching.parquet' (FORMAT PARQUET);COPY (
SELECT artists.*
FROM 'artists.parquet' artists
INNER JOIN 'tracks_artists_matching.parquet' track_artists
ON artists.rowid = track_artists.artist_rowid
GROUP BY ALL
) TO 'artists_matching.parquet' (FORMAT PARQUET);COPY (
SELECT albums.*
FROM 'filtered_tracks.parquet' filtered_tracks
INNER JOIN 'albums.parquet' albums
ON albums.rowid = filtered_tracks.album_rowid
GROUP BY ALL
) TO 'albums_matching.parquet' (FORMAT PARQUET);COPY (
SELECT album_images.*
FROM 'albums_matching.parquet' albums_matching
INNER JOIN 'album_images.parquet' album_images
ON album_images.album_rowid = albums_matching.rowid
WHERE width = 64
GROUP BY ALL
) TO 'album_images_matching.parquet' (FORMAT PARQUET);Install is run once
INSTALL sqlite;
LOAD sqlite;
ATTACH 'spotify_meta.db' AS sqlite_db (TYPE SQLITE);
CREATE TABLE sqlite_db.tracks AS SELECT * from 'filtered_tracks.parquet';
CREATE TABLE sqlite_db.playlists AS SELECT * from 'playlists_follow_300kplus.parquet';
CREATE TABLE sqlite_db.playlist_tracks AS SELECT * from 'playlist_tracks_matching.parquet';
CREATE TABLE sqlite_db.track_artists AS SELECT * from 'tracks_artists_matching.parquet';
CREATE TABLE sqlite_db.artists AS SELECT * from 'artists_matching.parquet';
CREATE TABLE sqlite_db.albums AS SELECT * from 'albums_matching.parquet';
CREATE TABLE sqlite_db.album_images AS SELECT * from 'album_images_matching.parquet';USE sqlite_db;
CREATE INDEX `track_artists_trackrowid` ON `track_artists` (`track_rowid`);
CREATE INDEX `playlist_tracks_playlistrowid` ON `playlist_tracks` (`playlist_rowid`);
CREATE INDEX `tracks_rowid` ON `tracks` (`rowid`);
CREATE INDEX `artists_rowid` ON `artists` (`rowid`);
CREATE INDEX `albums_rowid` ON `albums` (`rowid`);
CREATE INDEX `album_images_albumrowid` ON `album_images` (`album_rowid`);