# Generate Bloom filters of submitted Hacker News stories periodically and
# on command, for use in the accompanying browser extension.
#
# Created by Jacob Strieb
# January 2021
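#
# A Bloom filter is a compact, probabilistic set: lookups may return false
# positives but never false negatives, which is acceptable for checking
# whether a URL has already been submitted to Hacker News.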
name: Generate Bloom Filters
# Run once every 24 hours because the BigQuery database is updated once every
# 24 hours. Also run when manually triggered.
on:
schedule:
    # This time (16:05 UTC) is loosely based on when the BigQuery data updates
- cron: "5 16 * * *"
workflow_dispatch:
jobs:
create-bloomfilter:
name: Create Bloom Filters
runs-on: ubuntu-latest
steps:
- name: Install dependencies
run: |
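          # jo builds JSON objects and arrays from command-line arguments; it
          # is used below to emit the info.json metadata file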
          sudo apt-get install -y jo
- uses: actions/checkout@v2
- name: Set up BigQuery
run: |
          # The BQ_JSON secret contains the JSON key downloaded for a
          # BigQuery service account, pasted directly into a GitHub Actions
          # secret (under the repository Settings)
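          # For example (hypothetical filename), the secret could be set with
          # the GitHub CLI:
          #   gh secret set BQ_JSON < service-account.json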
echo '${{ secrets.BQ_JSON }}' > bq.json
gcloud auth \
activate-service-account \
--key-file bq.json
          # The first bq invocation with newly activated credentials prints a
          # one-time welcome message, so run a throwaway query here to keep
          # that output from ending up in the CSV we process below
bq query \
--project_id=hacker-news-analysis-300719 \
--use_legacy_sql=false \
'SELECT COUNT(*) FROM `bigquery-public-data.hacker_news.full`
WHERE type = "story"'
- name: Compile Bloom filter creation program
run: |
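          # This presumably builds bin/bloom-create, which later steps pipe
          # canonicalized URLs into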
make create
- name: Pull BigQuery data
run: |
          # Note that max_rows must be an unsigned 32-bit int
bq query \
--project_id=hacker-news-analysis-300719 \
--format csv \
--max_rows 99999999 \
--use_legacy_sql=false \
'SELECT
url,
score,
descendants,
time
FROM
`bigquery-public-data.hacker_news.full`
WHERE
type = "story"
AND dead IS NOT TRUE
AND deleted IS NOT TRUE
AND url != ""' \
> data.csv
head -n 50 data.csv
# Put the data in a sqlite3 database
sqlite3 \
data.db \
".mode csv" \
".import data.csv hn"
- name: Generate Bloom Filters
run: |
# Get the time of the last submission
LATEST=$(sqlite3 \
-csv \
data.db \
"SELECT CAST(time AS INT) AS inttime
FROM hn ORDER BY inttime DESC LIMIT 1")
# TODO: Make this accurate -- not sure why but it's inaccurate now
printf \
"Last story submitted %d hours ago\n" \
$((($(date +%s) - LATEST) / 3600))
# Generate one Bloom filter for each date range/threshold combination
# Used as sqlite3 date modifiers:
# https://sqlite.org/lang_datefunc.html
DATE_RANGES=(
# "-1 month"
"-7 days"
"-24 hours"
)
THRESHOLDS=(
0
10
75
250
500
)
# Apply the threshold to points (score) or comments (descendants)
THRESHOLD_KEY="score"
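          # Filters land in generated/ as hn-<threshold>.bloom (all time) and
          # hn-<range>-<threshold>.bloom (e.g. hn-24hours-250.bloom)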
mkdir generated
for THRESHOLD in "${THRESHOLDS[@]}"; do
FILENAME="hn-$THRESHOLD.bloom"
# Make a Bloom filter for each threshold with no time restriction
echo "Creating bloom filter for stories from all time and" \
"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..."
sqlite3 \
-header \
-csv \
data.db \
"SELECT * FROM hn
WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD" \
| python3 canonicalize.py \
| bin/bloom-create "generated/$FILENAME"
            # For the current threshold, make a Bloom filter for each date range
for DATE_RANGE in "${DATE_RANGES[@]}"; do
DATE_RANGE_STR="$(echo $DATE_RANGE | sed 's/[- ]//g')"
FILENAME="hn-$DATE_RANGE_STR-$THRESHOLD.bloom"
echo "Creating bloom filter for $DATE_RANGE and" \
"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..."
sqlite3 \
-header \
-csv \
data.db \
"SELECT * FROM hn
WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD
AND CAST(time AS INT) > strftime('%s', (
SELECT CAST(time AS INT) AS inttime FROM hn
ORDER BY inttime DESC LIMIT 1
), 'unixepoch', '$DATE_RANGE')" \
| python3 canonicalize.py \
| bin/bloom-create "generated/$FILENAME"
done
done
# Output a JSON file with information about the thresholds and dates
#
# TODO: Fix to use $LATEST instead of now
DATES=()
for DATE_RANGE in "${DATE_RANGES[@]}"; do
DATE_RANGE_STR="$(echo $DATE_RANGE | sed 's/[- ]//g')"
DATE="$(date -d "now ""$DATE_RANGE" +%s)"
DATES+=("$DATE=$DATE_RANGE_STR")
done
          # next_generated is 24 hours out (plus 20 minutes of slack) because
          # this workflow runs every 24 hours and takes 10 to 20 minutes to
          # complete
jo \
-p \
thresholds=$(jo -a "${THRESHOLDS[@]}") \
dates=$(jo "${DATES[@]}") \
version="0.6" \
compressed="true" \
date_generated="$(date +%s)" \
next_generated="$(date -d "now +24 hours +20 minutes" +%s)" \
last_submitted="$(sqlite3 \
-csv \
data.db \
"SELECT CAST(time AS INT) AS inttime
FROM hn ORDER BY inttime DESC LIMIT 1")" \
> generated/info.json
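          # The resulting info.json looks roughly like this (timestamps
          # illustrative; jo coerces numeric- and boolean-looking values):
          #   {
          #     "thresholds": [0, 10, 75, 250, 500],
          #     "dates": { "1611095200": "7days", "1611613600": "24hours" },
          #     "version": 0.6,
          #     "compressed": true,
          #     "date_generated": 1611700000,
          #     "next_generated": 1611787600,
          #     "last_submitted": 1611690000
          #   }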
- name: Create Release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
cd generated
          # Carry the latest browser extension build forward into the new
          # release
curl \
--location \
--output "hackernews-button.xpi" \
"https://github.com/jstrieb/hackernews-button/releases/latest/download/hackernews-button.xpi"
# TODO: Fix fatal name collision failure if run more than once a day?
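          # One possible fix (untested here): delete any existing same-day
          # release first, ignoring errors if there is none:
          #   gh release delete "$(date +%Y%m%d)" --yes || true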
gh release create \
"$(date +%Y%m%d)" \
--title "$(date '+%b %d, %Y') Bloom filters" \
            --notes "Updated Bloom filters for stories submitted to Hacker News, generated $(date '+%B %d, %Y')" \
*