# Generate Bloom filters of submitted Hacker News stories periodically and
# on command, for use in the accompanying browser extension.
#
# Created by Jacob Strieb
# January 2021
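#
# A Bloom filter is a compact, probabilistic set: lookups may return false
# positives but never false negatives, which is acceptable for checking
# whether a URL has already been submitted to Hacker News.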
name: Generate Bloom Filters
# Run once every 24 hours because the BigQuery database is updated once every
# 24 hours. Also run when manually triggered.
on:
schedule:
    # This time (16:05 UTC) is loosely based on when the BigQuery data updates
- cron: "5 16 * * *"
workflow_dispatch:
jobs:
create-bloomfilter:
name: Create Bloom Filters
runs-on: ubuntu-latest
steps:
- name: Install dependencies
run: |
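          # jo builds JSON objects and arrays from command-line arguments; it
          # is used below to emit the info.json metadata file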
          sudo apt-get install -y jo
- uses: actions/checkout@v2
- name: Set up BigQuery
run: |
          # The BQ_JSON secret contains the JSON key downloaded for a
          # BigQuery service account, pasted directly into a GitHub Actions
          # secret (under the repository Settings)
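          # For example (hypothetical filename), the secret could be set with
          # the GitHub CLI:
          #   gh secret set BQ_JSON < service-account.json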
echo '${{ secrets.BQ_JSON }}' > bq.json
gcloud auth \
activate-service-account \
--key-file bq.json
          # The first bq invocation with newly activated credentials prints a
          # one-time welcome message, so run a throwaway query here to keep
          # that output from ending up in the CSV we process below
bq query \
--project_id=hacker-news-analysis-300719 \
--use_legacy_sql=false \
'SELECT COUNT(*) FROM `bigquery-public-data.hacker_news.full`
WHERE type = "story"'
- name: Compile Bloom filter creation program
run: |
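          # This presumably builds bin/bloom-create, which later steps pipe
          # canonicalized URLs into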
make create
- name: Pull BigQuery data
run: |
          # Note that max_rows must be an unsigned 32-bit int
bq query \
--project_id=hacker-news-analysis-300719 \
--format csv \
--max_rows 99999999 \
--use_legacy_sql=false \
'SELECT
url,
score,
descendants,
time
FROM
`bigquery-public-data.hacker_news.full`
WHERE
type = "story"
AND dead IS NOT TRUE
AND deleted IS NOT TRUE
AND url != ""' \
> data.csv
head -n 50 data.csv
# Put the data in a sqlite3 database
sqlite3 \
data.db \
".mode csv" \
".import data.csv hn"
- name: Generate Bloom Filters
run: |
# Get the time of the last submission
LATEST=$(sqlite3 \
-csv \
data.db \
"SELECT CAST(time AS INT) AS inttime
FROM hn ORDER BY inttime DESC LIMIT 1")
# TODO: Make this accurate -- not sure why but it's inaccurate now
printf \
"Last story submitted %d hours ago\n" \
$((($(date +%s) - LATEST) / 3600))
# Generate one Bloom filter for each date range/threshold combination
# Used as sqlite3 date modifiers:
# https://sqlite.org/lang_datefunc.html
DATE_RANGES=(
# "-1 month"
"-7 days"
"-24 hours"
)
THRESHOLDS=(
0
10
75
250
500
)
# Apply the threshold to points (score) or comments (descendants)
THRESHOLD_KEY="score"
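          # Filters land in generated/ as hn-<threshold>.bloom (all time) and
          # hn-<range>-<threshold>.bloom (e.g. hn-24hours-250.bloom)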
mkdir generated
for THRESHOLD in "${THRESHOLDS[@]}"; do
FILENAME="hn-$THRESHOLD.bloom"
# Make a Bloom filter for each threshold with no time restriction
echo "Creating bloom filter for stories from all time and" \
"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..."
sqlite3 \
-header \
-csv \
data.db \
"SELECT * FROM hn
WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD" \
| python3 canonicalize.py \
| bin/bloom-create "generated/$FILENAME"
            # For the current threshold, make a Bloom filter for each date range
for DATE_RANGE in "${DATE_RANGES[@]}"; do
DATE_RANGE_STR="$(echo $DATE_RANGE | sed 's/[- ]//g')"
FILENAME="hn-$DATE_RANGE_STR-$THRESHOLD.bloom"
echo "Creating bloom filter for $DATE_RANGE and" \
"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..."
sqlite3 \
-header \
-csv \
data.db \
"SELECT * FROM hn
WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD
AND CAST(time AS INT) > strftime('%s', (
SELECT CAST(time AS INT) AS inttime FROM hn
ORDER BY inttime DESC LIMIT 1
), 'unixepoch', '$DATE_RANGE')" \
| python3 canonicalize.py \
| bin/bloom-create "generated/$FILENAME"
done
done
# Output a JSON file with information about the thresholds and dates
#
# TODO: Fix to use $LATEST instead of now
DATES=()
for DATE_RANGE in "${DATE_RANGES[@]}"; do
DATE_RANGE_STR="$(echo $DATE_RANGE | sed 's/[- ]//g')"
DATE="$(date -d "now ""$DATE_RANGE" +%s)"
DATES+=("$DATE=$DATE_RANGE_STR")
done
          # next_generated is 24 hours out (plus 20 minutes of slack) because
          # this workflow runs every 24 hours and takes 10 to 20 minutes to
          # complete
jo \
-p \
thresholds=$(jo -a "${THRESHOLDS[@]}") \
dates=$(jo "${DATES[@]}") \
version="0.6" \
compressed="true" \
date_generated="$(date +%s)" \
next_generated="$(date -d "now +24 hours +20 minutes" +%s)" \
last_submitted="$(sqlite3 \
-csv \
data.db \
"SELECT CAST(time AS INT) AS inttime
FROM hn ORDER BY inttime DESC LIMIT 1")" \
> generated/info.json
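          # The resulting info.json looks roughly like this (timestamps
          # illustrative; jo coerces numeric- and boolean-looking values):
          #   {
          #     "thresholds": [0, 10, 75, 250, 500],
          #     "dates": { "1611095200": "7days", "1611613600": "24hours" },
          #     "version": 0.6,
          #     "compressed": true,
          #     "date_generated": 1611700000,
          #     "next_generated": 1611787600,
          #     "last_submitted": 1611690000
          #   }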
- name: Create Release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
cd generated
          # Carry the latest browser extension build forward into the new
          # release
curl \
--location \
--output "hackernews-button.xpi" \
"https://github.com/jstrieb/hackernews-button/releases/latest/download/hackernews-button.xpi"
# TODO: Fix fatal name collision failure if run more than once a day?
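          # One possible fix (untested here): delete any existing same-day
          # release first, ignoring errors if there is none:
          #   gh release delete "$(date +%Y%m%d)" --yes || true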
gh release create \
"$(date +%Y%m%d)" \
--title "$(date '+%b %d, %Y') Bloom filters" \
            --notes "Updated Bloom filters for stories submitted to Hacker News, generated $(date '+%B %d, %Y')" \
*