Add Vierstra motif archetypes, fix JASPAR version, improve CIS-BP links

Shaun Mahony · claude · Shaun Mahony · commit 033da20106f9 · 2026-03-01T12:13:00.000-05:00
- Add Vierstra non-redundant TF motif clustering v2.0 as a new database:
  parse MEME-format archetype models grouped by TF family
- Fix JASPAR version display: 2024 → 2026
- Fix CIS-BP: always update urlPattern on existing records so homepage
  links don't go missing on re-sync
- Add CIS-BP per-entry links via TF identifier (baseId) so match result
  names link to the correct TF report page
- Add "vierstra" to ReferenceDatabase source enum and DatabaseSelector

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/web/src/app/admin/page.tsx b/web/src/app/admin/page.tsx
@@ -42,6 +42,10 @@ export default function AdminPage() {
   const [hocomocoResult, setHocomocoResult] = useState<string | null>(null);
   const [hocomocoCollection, setHocomocoCollection] = useState<string>("H14CORE");
 
+  // Vierstra state
+  const [vierstraSyncing, setVierstraSyncing] = useState(false);
+  const [vierstraResult, setVierstraResult] = useState<string | null>(null);
+
   const fetchData = useCallback(async () => {
     const [jobsRes, dbsRes] = await Promise.all([
       fetch("/api/admin/jobs"),
@@ -116,6 +120,30 @@ export default function AdminPage() {
     }
   };
 
+  const handleVierstraSync = async () => { // Trigger a Vierstra sync through the admin API and show a one-line outcome message
+    setVierstraSyncing(true); // the sync button is disabled while this flag is true
+    setVierstraResult(null); // clear the message left by any previous run
+    try {
+      const res = await fetch("/api/admin/sync-vierstra", { // POST with no request body
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+      });
+      const data = await res.json(); // parsed before res.ok is checked: a non-JSON response throws into the catch below
+      if (res.ok) {
+        setVierstraResult( // summary fields are read from data.result; its shape is defined by syncVierstra, which is not visible here
+          `Sync complete: ${data.result.totalStored} archetypes stored from ${data.result.families.length} families, ${data.result.errors.length} errors`
+        );
+        fetchData(); // re-run fetchData after a successful sync; the returned promise is not awaited
+      } else {
+        setVierstraResult(`Sync failed: ${data.error}`); // the sync-vierstra route puts an "error" string in its 401 and 500 bodies
+      }
+    } catch (err) { // network failure or JSON parse failure
+      setVierstraResult(`Sync failed: ${err instanceof Error ? err.message : String(err)}`);
+    } finally {
+      setVierstraSyncing(false); // re-enable the button on success and on failure
+    }
+  };
+
   const handleCisbpSync = async () => {
     setCisbpSyncing(true);
     setCisbpResult(null);
@@ -363,6 +391,33 @@ export default function AdminPage() {
             <p className="text-sm text-gray-600 mt-2">{hocomocoResult}</p>
           )}
         </div>
+
+        {/* Vierstra Sync Controls */}
+        <div className="border-t border-gray-200 pt-4 mt-4">
+          <h4 className="text-sm font-medium text-gray-900 mb-3">
+            Sync Vierstra Motif Archetypes
+          </h4>
+          <p className="text-xs text-gray-500 mb-3">
+            Downloads consensus archetype models from the{" "}
+            <a
+              href="https://resources.altius.org/~jvierstra/projects/motif-clustering-v2.0beta/"
+              target="_blank"
+              rel="noopener noreferrer"
+              className="underline hover:text-gray-600"
+            >
+              Vierstra non-redundant TF motif clustering v2.0
+            </a>
+            . Motifs are grouped by TF family.
+          </p>
+          <div className="flex items-center gap-3">
+            <Button onClick={handleVierstraSync} disabled={vierstraSyncing}>
+              {vierstraSyncing ? "Syncing..." : "Sync Vierstra Archetypes"}
+            </Button>
+          </div>
+          {vierstraResult && (
+            <p className="text-sm text-gray-600 mt-2">{vierstraResult}</p>
+          )}
+        </div>
       </Card>
     </div>
   );
diff --git a/web/src/app/api/admin/sync-vierstra/route.ts b/web/src/app/api/admin/sync-vierstra/route.ts
@@ -0,0 +1,28 @@
+import { NextRequest, NextResponse } from "next/server";
+import { connectDB } from "@/lib/db/mongoose";
+import { syncVierstra } from "@/lib/vierstra/sync";
+
+function isAdmin(request: NextRequest): boolean { // True only when the "stamp-admin" cookie is present with the exact value "authenticated"
+  return request.cookies.get("stamp-admin")?.value === "authenticated"; // NOTE(review): fixed-string comparison; confirm how this cookie is issued and protected
+}
+
+export async function POST(request: NextRequest) { // POST handler: run the Vierstra sync for admin callers and return its result as JSON
+  if (!isAdmin(request)) { // reject before connecting to the database
+    return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
+  }
+
+  try {
+    await connectDB(); // the connection is established before the sync starts
+    const result = await syncVierstra(); // called with no arguments; the request body is never read
+    return NextResponse.json({ success: true, result });
+  } catch (error) { // anything thrown by connectDB or syncVierstra becomes a 500 carrying the message
+    return NextResponse.json(
+      {
+        error: `Vierstra sync failed: ${
+          error instanceof Error ? error.message : String(error)
+        }`,
+      },
+      { status: 500 }
+    );
+  }
+}
diff --git a/web/src/components/job/DatabaseSelector.tsx b/web/src/components/job/DatabaseSelector.tsx
@@ -55,10 +55,11 @@ export function DatabaseSelector({ value, onChange }: DatabaseSelectorProps) {
   );
 
   // Source display names and order
-  const sourceOrder = ["jaspar", "hocomoco", "cisbp", "custom"];
+  const sourceOrder = ["jaspar", "hocomoco", "vierstra", "cisbp", "custom"];
   const sourceLabels: Record<string, string> = {
     jaspar: "JASPAR",
     hocomoco: "HOCOMOCO",
+    vierstra: "Vierstra Archetypes",
     cisbp: "CIS-BP",
     custom: "Custom",
   };
diff --git a/web/src/lib/cisbp/sync.ts b/web/src/lib/cisbp/sync.ts
@@ -18,6 +18,9 @@ export interface CisbpSyncResult {
 const CISBP_BASE_URL =
   "https://cisbp.ccbr.utoronto.ca/data/3_00/DataFiles/Bulk_downloads/EntireDataset";
 
+const CISBP_URL_PATTERN = // written to urlPattern on the CIS-BP ReferenceDatabase record, both when it is created and when it already exists
+  "https://cisbp.ccbr.utoronto.ca/TFnewreport.php?searchTF={id}"; // "{id}" is a placeholder; presumably replaced with a motif's baseId (TF ID) where links are rendered — confirm
+
 /**
  * Sync CIS-BP motifs by downloading directly from the CIS-BP server.
  * Uses streaming ZIP extraction to avoid loading large files into memory as strings.
@@ -208,11 +211,16 @@ async function storeCisbpMotifsFromStream(
       source: "cisbp",
       description: "Catalog of Inferred Sequence Binding Preferences",
       version: "Build 3.00",
-      urlPattern: "https://cisbp.ccbr.utoronto.ca/TFreport.php?searchTF={id}",
+      urlPattern: CISBP_URL_PATTERN,
       taxonGroups: [],
       isActive: true,
     });
     await refDb.save();
+  } else {
+    await ReferenceDatabase.updateOne(
+      { _id: refDb._id },
+      { version: "Build 3.00", urlPattern: CISBP_URL_PATTERN }
+    );
   }
 
   const dbId = refDb._id as Types.ObjectId;
@@ -258,6 +266,7 @@ async function storeCisbpMotifsFromStream(
       motifDocs.push({
         databaseRef: dbId,
         matrixId: motifId,
+        baseId: info?.tfId || null,
         name: tfName,
         dbSource: "CIS-BP",
         group: species,
@@ -328,11 +337,16 @@ async function storeCisbpMotifs(
       source: "cisbp",
       description: "Catalog of Inferred Sequence Binding Preferences",
       version: "Build 3.00",
-      urlPattern: "https://cisbp.ccbr.utoronto.ca/TFreport.php?searchTF={id}",
+      urlPattern: CISBP_URL_PATTERN,
       taxonGroups: [],
       isActive: true,
     });
     await refDb.save();
+  } else {
+    await ReferenceDatabase.updateOne(
+      { _id: refDb._id },
+      { version: "Build 3.00", urlPattern: CISBP_URL_PATTERN }
+    );
   }
 
   const dbId = refDb._id as Types.ObjectId;
@@ -358,6 +372,7 @@ async function storeCisbpMotifs(
       motifDocs.push({
         databaseRef: dbId,
         matrixId: motifId,
+        baseId: info?.tfId || null,
         name: tfName,
         dbSource: "CIS-BP",
         group: species,
diff --git a/web/src/lib/db/models/ReferenceDatabase.ts b/web/src/lib/db/models/ReferenceDatabase.ts
@@ -3,7 +3,7 @@ import mongoose, { Schema, Document } from "mongoose";
 export interface IReferenceDatabase extends Document {
   name: string;
   slug: string;
-  source: "jaspar" | "cisbp" | "hocomoco" | "custom";
+  source: "jaspar" | "cisbp" | "hocomoco" | "vierstra" | "custom";
   description: string;
   jasparCollection: string | null;
   version: string | null;
@@ -21,7 +21,7 @@ const ReferenceDatabaseSchema = new Schema<IReferenceDatabase>(
   {
     name: { type: String, required: true },
     slug: { type: String, required: true, unique: true, index: true },
-    source: { type: String, required: true, enum: ["jaspar", "cisbp", "hocomoco", "custom"] },
+    source: { type: String, required: true, enum: ["jaspar", "cisbp", "hocomoco", "vierstra", "custom"] },
     description: { type: String, default: "" },
     jasparCollection: { type: String, default: null },
     version: { type: String, default: null },
diff --git a/web/src/lib/jaspar/sync.ts b/web/src/lib/jaspar/sync.ts
@@ -33,7 +33,7 @@ export async function syncJaspar(options: SyncOptions = {}): Promise<SyncResult>
       source: "jaspar",
       description: `JASPAR ${collection} transcription factor binding profiles`,
       jasparCollection: collection,
-      version: "2024",
+      version: "2026",
       urlPattern: "https://jaspar.elixir.no/matrix/{id}",
       taxonGroups: [],
       isActive: true,
@@ -143,7 +143,7 @@ export async function syncJaspar(options: SyncOptions = {}): Promise<SyncResult>
       lastSyncedAt: new Date(),
       motifCount,
       taxonGroups: storedTaxonGroups,
-      version: "2024",
+      version: "2026",
       urlPattern: "https://jaspar.elixir.no/matrix/{id}",
     }
   );
diff --git a/web/src/lib/vierstra/parser.ts b/web/src/lib/vierstra/parser.ts
@@ -0,0 +1,103 @@
+/**
+ * Parser for the Vierstra motif clustering v2.0 MEME-format archetype files.
+ *
+ * MEME format:
+ *   MOTIF AC0001:DLX/LHX:Homeodomain AC0001:DLX/LHX:Homeodomain
+ *
+ *   letter-probability matrix: alength= 4 w= 6 nsites= 20 E= 0
+ *     0.014812  0.085107  0.008622  0.891459
+ *     ...
+ *
+ * The motif ID is structured as "archetypeId:tfNames:family".
+ */
+
+export interface VierstraMotifRecord {
+  motifId: string;   // first token after "MOTIF ", e.g. "AC0001:DLX/LHX:Homeodomain"
+  archetypeId: string; // segment before the first ":", e.g. "AC0001"
+  tfNames: string;   // second ":"-separated segment, e.g. "DLX/LHX"
+  family: string;    // remaining segment(s), e.g. "Homeodomain"; "Unknown" when the ID has none
+  pfm: { A: number[]; C: number[]; G: number[]; T: number[] }; // one entry per accepted matrix row in each array, taken from columns 1-4 in A, C, G, T order
+}
+
+/**
+ * Parse MEME-format text into one record per MOTIF block that has matrix rows.
+ * Lines outside MOTIF blocks (such as the file header) are skipped, and a MOTIF
+ * whose matrix yields no numeric rows is dropped. Each accepted row's first four
+ * values are stored as A, C, G, T in that order.
+ */
+export function parseMemePwms(content: string): VierstraMotifRecord[] {
+  const motifs: VierstraMotifRecord[] = [];
+  const lines = content.split(/\r?\n/); // accepts both LF and CRLF line endings
+
+  let i = 0;
+  while (i < lines.length) {
+    const line = lines[i].trim();
+
+    if (line.startsWith("MOTIF ")) {
+      // The ID is the first whitespace-delimited token after "MOTIF "; an alternate name, if any, is ignored
+      const motifId = line.substring(6).split(/\s+/)[0];
+      const { archetypeId, tfNames, family } = parseMotifId(motifId);
+
+      // Scan forward to the "letter-probability matrix:" line. NOTE(review): this scan does not stop at a following "MOTIF " line
+      i++;
+      while (i < lines.length && !lines[i].trim().startsWith("letter-probability matrix:")) {
+        i++;
+      }
+      if (i >= lines.length) break; // no matrix header before EOF: stop parsing and return what was collected
+
+      // Step past the matrix header line; its alength=/w=/nsites= fields are not read
+      i++;
+
+      // Collect matrix rows until a blank line, a "MOTIF " line, a "URL " line, or EOF
+      const A: number[] = [];
+      const C: number[] = [];
+      const G: number[] = [];
+      const T: number[] = [];
+
+      while (i < lines.length) {
+        const row = lines[i].trim();
+        if (row === "" || row.startsWith("MOTIF ") || row.startsWith("URL ")) break; // i is left on the terminating line so the outer loop sees it
+
+        const vals = row.split(/\s+/).map(Number);
+        if (vals.length >= 4 && !isNaN(vals[0])) { // NOTE(review): only the first column is checked for NaN; other rows are skipped silently
+          A.push(vals[0]);
+          C.push(vals[1]);
+          G.push(vals[2]);
+          T.push(vals[3]);
+        }
+        i++;
+      }
+
+      if (A.length > 0) { // a MOTIF with zero accepted rows produces no record
+        motifs.push({
+          motifId,
+          archetypeId,
+          tfNames,
+          family,
+          pfm: { A, C, G, T },
+        });
+      }
+    } else {
+      i++; // not a MOTIF line (header, blank, or other text): skip it
+    }
+  }
+
+  return motifs;
+}
+
+/**
+ * Split an ID such as "AC0001:DLX/LHX:Homeodomain" on ":" into archetypeId, tfNames and family.
+ */
+function parseMotifId(id: string): { archetypeId: string; tfNames: string; family: string } {
+  const parts = id.split(":");
+  if (parts.length >= 3) {
+    return {
+      archetypeId: parts[0],
+      tfNames: parts[1],
+      family: parts.slice(2).join(":"), // any ":" after the second segment is kept inside family
+    };
+  } else if (parts.length === 2) { // two segments: no family was given
+    return { archetypeId: parts[0], tfNames: parts[1], family: "Unknown" };
+  }
+  return { archetypeId: id, tfNames: id, family: "Unknown" }; // no ":" present: the whole ID is used for both archetypeId and tfNames
+}
diff --git a/web/src/lib/vierstra/sync.ts b/web/src/lib/vierstra/sync.ts
diff --git a/web/worker/processor.ts b/web/worker/processor.ts