Categories: geek » development » js

RSS - Atom - Subscribe via email

Emacs Lisp and NodeJS: Getting the bolded words from a section of a Google Document

Posted: - Modified: | french, js, emacs
  • : Cleaned up links from Google
  • : Simplified getting a section or finding the bolded text by using the Org Mode format instead.

During the sessions with my French tutor, I share a Google document so that we can mark the words where I need to practice my pronunciation some more or tweak the wording. Using Ctrl+B to make the word as bold is an easy way to make it jump out.

I used to copy these changes into my Org Mode notes manually, but today I thought I'd try automating some of it.

First, I need a script to download the HTML for a specified Google document. This is probably easier to do with the NodeJS library rather than with oauth2.el and url-retrieve-synchronously because of various authentication things.

require('dotenv').config();
const { google } = require('googleapis');

async function download(fileId) {
  const auth = new google.auth.GoogleAuth({
    scopes: ['https://www.googleapis.com/auth/drive.readonly'],
  });
  const drive = google.drive({ version: 'v3', auth });
  const htmlRes = await drive.files.export({
    fileId: fileId,
    mimeType: 'text/html'
  });
  return htmlRes.data;
}

async function main() {
  console.log(await download(process.argv.length > 2 ? process.argv[2] : process.env['DOC_ID']));
}

main();

Then I can wrap a little bit of Emacs Lisp around it.

(defvar my-google-doc-download-command
  (list "nodejs" (expand-file-name "~/bin/download-google-doc-html.cjs")))

(defun my-google-doc-html (doc-id)
  (when (string-match "https://docs\\.google\\.com/document/d/\\(.+?\\)/" doc-id)
    (setq doc-id (match-string 1 doc-id)))
  (with-temp-buffer
    (apply #'call-process (car my-google-doc-download-command)
           nil t nil (append (cdr my-google-doc-download-command) (list doc-id)))
    (buffer-string)))

(defun my-google-doc-clean-html (html)
  "Remove links on spaces, replace Google links."
  (let ((dom (with-temp-buffer
               (insert html)
               (libxml-parse-html-region))))
    (dom-search
     dom
     (lambda (o)
       (when (eq (dom-tag o) 'a)
         (when (and (dom-attr o 'href)
                    (string-match "https://\\(www\\.\\)?google\\.com/url\\?q=" (dom-attr o 'href)))
           (let* ((parsed (url-path-and-query
                           (url-generic-parse-url (dom-attr o 'href))))
                  (params (url-parse-query-string (cdr parsed))))
             (dom-set-attribute o 'href (car (assoc-default "q" params #'string=)))))
         (let ((text (string= (string-trim (dom-text o)) "")))
           (when (string= text "")
             (setf (car o) 'span))))
       (when (and
              (string-match "font-weight:700" (or (dom-attr o 'style) ""))
              (not (string-match "font-style:normal" (or (dom-attr o 'style) ""))))
         (setf (car o) 'strong))
       (when (dom-attr o 'style)
         (dom-remove-attribute o 'style))))
    ;; bold text is actually represented as font-weight:700 instead
    (with-temp-buffer
      (svg-print dom)
      (buffer-string))))

(defun my-google-doc-org (doc-id)
  "Return DOC-ID in Org Mode format."
  (pandoc-convert-stdio (my-google-doc-clean-html (my-google-doc-html doc-id)) "html" "org"))

I have lots of sections in that document, including past journal entries, so I want to get a specific section by name.

(defun my-org-get-subtree-by-name (org-text heading-name)
  "Return ORG-TEXT subtree for HEADING-NAME."
  (with-temp-buffer
    (insert org-text)
    (org-mode)
    (goto-char (point-min))
    (let ((org-trust-scanner-tags t))
      (car (delq nil
                 (org-map-entries
                  (lambda ()
                    (when (string= (org-entry-get (point) "ITEM") heading-name)
                      (buffer-substring (point) (org-end-of-subtree))))))))))

Now I can get the bolded words from a section of my notes, with just a sentence for context. I use pandoc to convert it to Org Mode syntax.

(defvar my-lang-words-for-review-context-function 'sentence-at-point)
(defvar my-lang-tutor-notes-url nil)
(defun my-lang-tutor-notes (section-name)
  (my-org-get-subtree-by-name
   (my-google-doc-org my-lang-tutor-notes-url)
   section-name))

(defun my-lang-words-for-review (section)
  "List the bolded words for review in SECTION."
  (let* ((section (my-lang-tutor-notes section))
         results)
    (with-temp-buffer
      (insert section)
      (org-mode)
      (goto-char (point-min))
      (org-map-entries
       (lambda ()
         (org-end-of-meta-data t)
         (while (re-search-forward "\\*[^* ].*?\\*" nil t)
           (cl-pushnew
            (replace-regexp-in-string
             "[ \n ]+" " "
             (funcall my-lang-words-for-review-context-function))
            results
            :test 'string=)))))
    (nreverse results)))

For example, when I run it on my notes on artificial intelligence, this is the list of bolded words and the sentences that contain them.

(my-lang-words-for-review "Sur l'intelligence artificielle")
  • Je l'ai aussi utilisée pour faire des recherches.
  • Je peux consacrer une petite partie de mon budget à des essais, mais je ne veux pas travailler davantage pour rentabiliser une dépense plus importante.
  • Je n'ai pas le temps de concentration nécessaire pour justifier l'investissement dans mon propre matériel, et sinon, les progrès sont trop rapides pour m'engager dans une configuration spécifique.
  • J'ai une conscience aiguë des limites cognitives ou physiques à cause des difficultés de santé de ma mère et de ma sœur, et de mes expériences avec mes limitations à cause du fait que je suis la personne principalement en charge de ma fille.
  • Je lis très vite, mais je n'ai pas assez de patience pour les longs contenus vidéo ou audio.
  • Je n'aime pas les textes qui contiennent beaucoup de remplissage.
  • Beaucoup de gens ont une réaction forte contre l'IA pour plusieurs raisons qui incluent le battage médiatique excessif dont elle fait l'objet, son utilisation à mauvais escient, et l'inondation de banalité qu'elle produit.
  • Je réécris souvent la majorité du logiciel à l'exception d'un ou deux morceaux parce que ce code ne me convient pas.
  • Je ne veux pas l'utiliser pour les correctifs que je veux soumettre à d'autres projets parce que le code ne me semble pas correct et je ne veux pas gaspiller le temps d'autres bénévoles.
  • J'aime pouvoir lui donner trois dépôts git et des instructions pour générer un logiciel à partir d'un dépôt pour un autre via le troisième dépôt.
  • Mais je ne veux pas le publier avant de réécrire et tout comprendre.
  • Sans l'IA, je pourrais peut-être apprendre plus lentement avec l'aide d'Internet, qui a beaucoup de ressources commehttps://vitrinelinguistique.oqlf.gouv.qc.ca/Vitrine linguistique.
  • Je veux profiter davantage, apprendre davantage avec l'aide de vraies personnes, complétée par l'aide de l'IA.
  • J'adore les sous-titres simultanés, mais je n'ai pas toujours trouvé une méthode ou un système qui me convienne.

I can then go into the WhisperX transcription JSON file and replay those parts for closer review.

I can also tweak the context function to give me less information. For example, to limit it to the containing phrase, I can do this:

(defun my-split-string-keep-delimiters (string delimiter)
  (when string
    (let (results pos)
      (with-temp-buffer
        (insert string)
        (goto-char (point-min))
        (setq pos (point-min))
        (while (re-search-forward delimiter nil t)
          (push (buffer-substring pos (match-beginning 0)) results)
          (setq pos (match-beginning 0)))
        (push (buffer-substring pos (point-max)) results)
        (nreverse results)))))

(ert-deftest my-split-string-keep-delimiters ()
 (should
  (equal (my-split-string-keep-delimiters
          "Beaucoup de gens ont une réaction forte contre l'IA pour plusieurs raisons qui *incluent* le battage médiatique excessif dont elle fait l'objet, son utilisation à mauvais escient, et *l'inondation de banalité* qu'elle produit."
          ", \\| que \\| qui \\| qu'ils? \\| qu'elles? \\| qu'on "
          )
 )))

(defun my-lang-words-for-review-phrase-context (&optional s)
  (setq s (replace-regexp-in-string " " " " (or s (sentence-at-point))))
  (string-join
   (seq-filter (lambda (s) (string-match "\\*" s))
               (my-split-string-keep-delimiters s ", \\| parce que \\| que \\| qui \\| qu'ils? \\| qu'elles? \\| qu'on \\| pour "))
   " ... "))

(ert-deftest my-lang-words-for-review-phrase-context ()
  (should
   (equal (my-lang-words-for-review-phrase-context
           "Je peux consacrer une petite partie de mon *budget* à des essais, mais je ne veux pas travailler davantage pour rentabiliser une dépense plus importante.")
          "Je peux consacrer une petite partie de mon *budget* à des essais")))
(let ((my-lang-words-for-review-context-function 'my-lang-words-for-review-phrase-context))
  (my-lang-words-for-review "Sur l'intelligence artificielle"))
  • pour faire des recherches.
  • Je peux consacrer une petite partie de mon budget à des essais
  • , et sinon
  • J'ai une conscience aiguë des limites cognitives ou physiques à cause des difficultés de santé de ma mère et de ma sœur
  • pour les longs contenus vidéo ou audio.
  • Je n'aime pas les textes qui contiennent beaucoup de remplissage.
  • qui incluent le battage médiatique excessif dont elle fait l'objet … , et l'inondation de banalité
  • Je réécris souvent la majorité du logiciel à l'exception d'un ou deux morceaux
  • pour les correctifs … parce que le code ne me semble pas correct et je ne veux pas gaspiller le temps d'autres bénévoles.
  • pour un autre via le troisième dépôt.
  • Mais je ne veux pas le publier avant de réécrire et tout comprendre.
  • , je pourrais peut-être apprendre plus lentement avec l'aide d'Internet
  • , apprendre davantage avec l'aide de vraies personnes, complétée par l'aide de l'IA.
  • qui me convienne.

Now that I have a function for retrieving the HTML or Org Mode for a section, I can use that to wdiff against my current text to more easily spot wording changes.

(defun my-lang-tutor-notes-wdiff-org ()
  (interactive)
  (let ((section (org-entry-get (point) "ITEM")))
    (my-wdiff-strings
     (replace-regexp-in-string
      " " " "
      (my-org-subtree-text-without-blocks))
     (replace-regexp-in-string
      " " " "
      (my-lang-tutor-notes section)))))

Related:

Screenshot:

2026-03-12_11-28-24.png
Figure 1: wdiff
This is part of my Emacs configuration.
View Org source for this post

On this day

| 11ty, js

Nudged by org-daily-reflection (@emacsomancer's toot) and Jeremy Keith's post where he mentions his on this day page, I finally got around to making my own on this day page again. I use the 11ty static site generator, so it's static unless you have Javascript enabled. It might be good for bumping into things. I used to have an "On this day" widget back when I used Wordpress, which was fun to look at occasionally.

The code might be a little adamant about converting all the dates to America/Toronto:

11ty code for posts on this day
export default class OnThisDay {
  data() {
    return {
      layout: 'layouts/base',
      permalink: '/blog/on-this-day/',
      title: 'On this day'
    };
  }

  async render(data) {
    const today = new Date(new Date().toLocaleString('en-US', { timeZone: 'America/Toronto' }));
    const options = { month: 'long', day: 'numeric' };
    const date = today.toLocaleDateString('en-US', options);
    const currentMonthDay = today.toISOString().substring(5, 10);
    let list = data.collections._posts
        .filter(post => {
          const postDateTime = new Date(post.date).toLocaleString('en-US', { timeZone: 'America/Toronto' });
          const postMonthDay = (new Date(postDateTime)).toISOString().substring(5, 10);
          return postMonthDay === currentMonthDay;
        })
        .sort((a, b) => {
          if (a.date < b.date) return 1;
          if (a.date > b.date) return -1;
          return 0;
        })
        .map(post => {
          const postDateTime = new Date(post.date).toLocaleString('en-US', { timeZone: 'America/Toronto' });
          const postDate = new Date(postDateTime);
          const postYear = postDate.getFullYear();
          return `<li>${postYear}: <a href="proxy.php?url=${post.url}">${post.data.title}</a></li>`;
        })
        .join('\n');
    list = list.length > 0
      ? `<ul>${list}</ul>`
      : `<p>No posts were written on ${date} in previous years.</p>`;

    return `<section><h2>On this day</h2>
<p>This page lists posts written on this day throughout the years. If you've enabled Javascript, it will show the current day. If you don't, it'll show the posts from the day I last updated this blog. You might also like to explore <a href="proxy.php?url=/blog/all">all posts</a>, <a href="proxy.php?url=/topic">a topic-based outline</a> or <a href="proxy.php?url=/blog/category">categories</a>.</p>
<h3 class="date">${date}</h3>
<div id="posts-container">${list}</div>

<script>
  $(document).ready(function() { onThisDay(); });
</script>
</section>`;
  }
};
Client-side Javascript for the dynamic list
function onThisDay() {
  const tz = 'America/Toronto';
  function getEffectiveDate() {
    const urlParams = new URLSearchParams(window.location.search);
    const dateParam = urlParams.get('date');
    if (dateParam && /^\d{2}-\d{2}$/.test(dateParam)) {
      const currentYear = new Date().getFullYear();
      const dateObj = new Date(`${currentYear}-${dateParam}T12:00:00Z`);
      if (dateObj.getTime()) {
        return {
          monthDay: dateParam,
          formatted: dateObj.toLocaleDateString('en-US', { month: 'long', day: 'numeric' })
        };
      }
    }
    const today = new Date(new Date().toLocaleString('en-US', { timeZone: tz }));
    return {
      monthDay: today.toISOString().substring(5, 10), // MM-DD
      formatted: today.toLocaleDateString('en-US', { month: 'long', day: 'numeric' })
    };
  }
  // Fetch and process the posts
  fetch('/blog/all/index.json')
    .then(response => response.json())
    .then(posts => {
      const dateInfo = getEffectiveDate();
      const dateElement = document.querySelector('h3.date');
      if (dateElement) {
        dateElement.textContent = dateInfo.formatted;
      }
      const matchingPosts = posts.filter(post => {
        const postDate = new Date(post.date).toLocaleString('en-US', { timeZone: tz });
        const postMonthDay = (new Date(postDate)).toISOString().substring(5, 10);
        return postMonthDay === dateInfo.monthDay;
      });

      matchingPosts.sort((a, b) => {
        const dateA = new Date(a.date);
        const dateB = new Date(b.date);
        return dateB - dateA;
      });

      const elem = document.getElementById('posts-container');
      if (matchingPosts.length > 0) {
        const postsHTML = matchingPosts.map(post => {
          const postDate = new Date(post.date).toLocaleString('en-US', { timeZone: tz });
          const postYear = new Date(postDate).getFullYear();
          return `<li>${postYear}: <a href="proxy.php?url=${post.permalink}">${post.title}</a></li>`;
        }).join('\n');
        elem.innerHTML = `<ul>${postsHTML}</ul>`;
      } else {
        elem.innerHTML = `<p>No posts were written on ${dateInfo.formatted}.</p>`;
      }
    })
    .catch(error => {
      console.error('Error fetching posts:', error);
    });
}

I used to include the day's posts as a footer on the individual blog post page. That might be something to consider again.

View org source for this post

Working with smaller chunks of thoughts; adding anchors to paragraphs in Org Mode HTML export

Posted: - Modified: | org, js

I write my blog posts in Org Mode and export them to Eleventy with ox-11ty, which is derived from the ox-html backend.

Sometimes I want to link to something in a different blog post. This lets me build on thoughts that are part of a post instead of being a whole post on their own.

If I haven't added an anchor to the blog post yet, I can add one so that I can link to that section. For really old posts where I don't have an Org source file, I can edit the HTML file directly and add an id="some-id" so that I can link to it with /url/to/post#some-id. Most of my new posts have Org source, though. I have a my-blog-edit-org function and a my-blog-edit-html function in my Emacs configuration to make it easier to jump to the Org file or HTML for a blog post.

If the section has a heading, then it's easy to make that linkable with a custom name. I can use org-set-property to set the CUSTOM_ID property to the anchor name. For example, this voice access section has a heading that has CUSTOM_ID, as you can see in the . If I don't mind having long anchor names, I can use the my-assign-custom-ids function from my config to automatically set them based on the outline path.

my-assign-custom-ids
(defun my-assign-custom-ids ()
  (interactive)
  (let ((custom-ids
         (org-map-entries (lambda () (org-entry-get (point) "CUSTOM_ID")) "CUSTOM_ID={.}")))
    (org-map-entries
     (lambda ()
       (let ((slug
              (replace-regexp-in-string
               "^-\\|-$" ""
               (replace-regexp-in-string "[^A-Za-z0-9]+" "-"
                                         (downcase (string-join (org-get-outline-path t) " "))))))
         (while (member slug custom-ids)
           (setq slug (read-string "Manually set custom ID: ")))
         (org-entry-put (point) "CUSTOM_ID" slug)))
     "-CUSTOM_ID={.}")))

Adding anchors to paragraphs

If the part that I want to link to is not a heading, I can add an ID by using the #+ATTR_HTML: :id ... directive, like this snippet from my reflection on landscapes and art:

  #+ATTR_HTML: :id interest-development
  That reminds me a little of another reflection
  I've been noodling around on interest development...

Text fragments

Text fragments are even more powerful, because I can link to a specific part of a paragraph. I can link to one segment with something like #::text=text+to+highlight~. I can specify multiple text fragments to highlight by using #::text=first+text+to+highlight&text=second+text~, and the browser will automatically scroll to the first highlighted section. I can specify a longer section by using text=textStart,textEnd. Example: #:~:text=That%20is%20the%20gap,described The text fragments documentation has more options, including using prefixes and suffixes to disambiguate matches.

Text fragment links require rel="noopener" for security, so I added JKC-Codes/eleventy-plugin-automatic-noopener to my 11ty config.

Update 2025-03-20: Quick ways to link to a text fragment:

  • On my Android phone, selecting text in Google Chrome and sharing it automatically includes the text and a link to the text fragment.
  • In Google Chrome on my iPad, my process is:
    1. Select the text and choose "Copy Link with Highlight".
    2. Tap the selected text again and share it.
    3. Paste the link after the shared text.
  • There's this Text Fragment extension for Firefox.
  • I have some Emacs Lisp to link to currently-selected text using Spookfox. Spookfox connects Emacs to Firefox using a browser extension. Once it's properly set up and connected, it allows Emacs to evaluate things in the Firefox context.

These seem like good starting points for addressing smaller chunks of thoughts.

View org source for this post

Checking caption timing by skimming with Emacs Lisp or JS

| js, emacs, subed

Sometimes automatic subtitle timing tools like Aeneas can get confused by silences, extraneous sounds, filler words, mis-starts, and text that I've edited out of the raw captions for easier readability. It's good to quickly check each caption. I used to listen to captions at 1.5x speed, watching carefully as each caption displayed. This took a fair bit of time and focus, so… it usually didn't happen. Sampling the first second of each caption is faster and requires a little less attention.

Skimming with subed.el

Here's a function that I wrote to play the first second of each subtitle.

(defvar my-subed-skim-msecs 1000 "Number of milliseconds to play when skimming.")
(defun my-subed-skim-starts ()
  (interactive)
  (subed-mpv-unpause)
  (subed-disable-loop-over-current-subtitle)
  (catch 'done
    (while (not (eobp))
      (subed-mpv-jump-to-current-subtitle)
      (let ((ch
             (read-char "(q)uit? " nil (/ my-subed-skim-msecs 1000.0))))
        (when ch
          (throw 'done t)))
      (subed-forward-subtitle-time-start)
      (when (and subed-waveform-minor-mode
                 (not subed-waveform-show-all))
        (subed-waveform-refresh))
      (recenter)))
  (subed-mpv-pause))

Now I can read the lines as the subtitles play, and I can press any key to stop so that I can fix timestamps.

Skimming with Javascript

I also want to check the times on the Web in case there have been caching issues. Here's some Javascript to skim the first second of each cue in the first text track for a video, with some code to make it easy to process the first video in the visible area.

function getVisibleVideo() {
  const videos = document.querySelectorAll('video');
  for (const video of videos) {
    const rect = video.getBoundingClientRect();
    if (
      rect.top >= 0 &&
      rect.left >= 0 &&
      rect.bottom <= (window.innerHeight || document.documentElement.clientHeight) &&
      rect.right <= (window.innerWidth || document.documentElement.clientWidth)
    ) {
      return video;
    }
  }
  return null;
}

async function skimVideo(video=getVisibleVideo(), msecs=1000) {
  // Get the first text track (assumed to be captions/subtitles)
  const textTrack = video.textTracks[0];
  if (!textTrack) return;
  const remaining = [...textTrack.cues].filter((cue) => cue.endTime >= video.currentTime);
  video.play();
  // Play the first 1 second of each visible subtitle
  for (let i = 0; i < remaining.length && !video.paused; i++) {
    video.currentTime = remaining[i].startTime;
    await new Promise((resolve) => setTimeout(resolve, msecs));
  }
}

Then I can call it with skimVideo();. Actually, in our backstage area, it might be useful to add a Skim button so that I can skim things from my phone.

function handleSkimButton(event) {
   const vid = event.target.closest('.vid').querySelector('video');
   skimVideo(vid);
 }

document.querySelectorAll('video').forEach((vid) => {
   const div = document.createElement('div');
   const skim = document.createElement('button');
   skim.textContent = 'Skim';
   div.appendChild(skim);
   vid.parentNode.insertBefore(div, vid.nextSibling);
   skim.addEventListener('click', handleSkimButton);
});

Results

How much faster is it this way?

Some code to help figure out the speedup
(-let* ((files (directory-files "~/proj/emacsconf/2024/cache" t "--main\\.vtt"))
        ((count-subs sum-seconds)
         (-unzip (mapcar
                  (lambda (file)
                    (list
                     (length (subed-parse-file file))
                     (/ (compile-media-get-file-duration-ms
                         (concat (file-name-sans-extension file) ".webm")) 1000.0)))
                  files)))
        (total-seconds (-reduce #'+ sum-seconds))
        (total-subs (-reduce #'+ count-subs)))
  (format "%d files, %.1f hours, %d total captions, speed up of %.1f"
          (length files)
          (/ total-seconds 3600.0)
          total-subs
          (/ total-seconds total-subs)))

It looks like for EmacsConf talks where we typically format captions to be one long line each (< 60 characters), this can be a speed-up of about 4x compared to listening to the video at normal speed. More usefully, it's different enough to get my brain to do it instead of putting it off.

Most of the automatically-generated timestamps are fine. It's just a few that might need tweaking. It's nice to be able to skim them with fewer keystrokes.

View org source for this post

Using Javascript to add a "Copy code" link to source code blocks in my blog posts

| css, js, blogging

I'd like to write about code more often. It's easier for people to try out ideas if they can copy the code without fiddling with selecting the text, especially on mobile browsers, so "Copy code" buttons on source code blocks would be nice. I used this tutorial for adding code buttons as a basis for the following CSS and JS code.

First, let's add the buttons with Javascript. I want the buttons to be visible in the summary line if I'm using the <details /> element. If not, they can go in the div with the org-src-container class.

/* Start of copy code */
// based on https://www.roboleary.net/2022/01/13/copy-code-to-clipboard-blog.html
const copyLabel = 'Copy code';

async function copyCode(block, button) {
  let code = block.querySelector('pre.src');
  let text = code.innerText;
  await navigator.clipboard.writeText(text);
  button.innerText = 'Copied';
  setTimeout(() => {
    button.innerText = copyLabel;
  }, 500);
}

function addCopyCodeButtons() {
  if (!navigator.clipboard) return;
  let blocks = document.querySelectorAll('.org-src-container');
  blocks.forEach((block) => {
    let button = document.createElement('button');
    button.innerText = copyLabel;
    button.classList.add('copy-code');
    let details = block.closest('details');
    let summary = details && details.querySelector('summary');
    if (summary) {
      summary.appendChild(button);
    } else {
      block.appendChild(button);
    }
    button.addEventListener('click', async() => {
      await copyCode(block, button);
    });
    block.setAttribute('tabindex', 0);
  });
}
document.addEventListener("DOMContentLoaded", function(event) { 
  addCopyCodeButtons();
});
/* End of copy code */

Then we style it:

/* Start of copy code */
pre.src { margin: 0 }
.org-src-container {
    position: relative;
    margin: 0 0;
    padding: 1.75rem 0 1.75rem 1rem;
}
summary { position: relative; }
summary .org-src-container { padding: 0 }
summary .org-src-container pre.src { margin: 0 }
.org-src-container button.copy-code, summary button.copy-code {
    position: absolute;
    top: 0px;
    right: 0px;
}
/* End of copy code */

Someday I'll figure out how to make it easier to tangle things to the post's directory and make the file available for download. In the meantime, this might be a good start.