|
/**
 * Parser for the Vierstra motif clustering v2.0 MEME-format archetype files.
 *
 * MEME format (one MOTIF block shown):
 *   MOTIF AC0001:DLX/LHX:Homeodomain AC0001:DLX/LHX:Homeodomain
 *
 *   letter-probability matrix: alength= 4 w= 6 nsites= 20 E= 0
 *   0.014812 0.085107 0.008622 0.891459
 *   ...
 *
 * Each matrix row holds the A, C, G and T values (in that order) for one
 * motif position.
 *
 * The motif ID is structured as "archetypeId:tfNames:family".
 */
| 13 | + |
/**
 * One motif parsed from a Vierstra archetype MEME file.
 */
export interface VierstraMotifRecord {
  /** Full ID as written on the MOTIF line, e.g. "AC0001:DLX/LHX:Homeodomain". */
  motifId: string;
  /** First colon-separated field of the ID, e.g. "AC0001". */
  archetypeId: string;
  /** Second colon-separated field of the ID, e.g. "DLX/LHX". */
  tfNames: string;
  /** Remainder of the ID after the second colon, e.g. "Homeodomain". */
  family: string;
  /** Matrix columns keyed by base; index k of each array is motif position k. */
  pfm: Record<"A" | "C" | "G" | "T", number[]>;
}
| 21 | + |
/**
 * Parse the text of a MEME-format motif file (e.g. consensus_pwms.meme) into
 * an array of motif records.
 *
 * Lines outside a MOTIF block (MEME version, ALPHABET, strands, background,
 * blank lines, ...) are skipped. For every "MOTIF <id> [<alt>]" line the
 * parser looks for that motif's "letter-probability matrix:" line and then
 * reads matrix rows until a blank line, a "URL " line, the next MOTIF line or
 * the end of input.
 *
 * - A MOTIF block that has no matrix line before the next MOTIF line is
 *   dropped; it never borrows the matrix of the following motif.
 * - A matrix row is used only when its first four tokens are all finite
 *   numbers; other lines inside the matrix are skipped.
 * - A motif with no usable rows is omitted from the result.
 *
 * @param content - Full file contents; LF and CRLF line endings are accepted.
 * @returns The parsed motifs in file order.
 */
export function parseMemePwms(content: string): VierstraMotifRecord[] {
  const motifPrefix = "MOTIF ";
  const matrixPrefix = "letter-probability matrix:";

  const motifs: VierstraMotifRecord[] = [];
  // Trim every line once so all prefix checks below see normalized text.
  const lines = content.split(/\r?\n/).map((raw) => raw.trim());

  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    if (!line.startsWith(motifPrefix)) {
      i++;
      continue;
    }

    // "MOTIF <id> [<alt>]": trim the remainder first so extra whitespace
    // after the keyword cannot yield an empty leading token as the ID.
    const motifId = line.substring(motifPrefix.length).trim().split(/\s+/)[0];
    const { archetypeId, tfNames, family } = parseMotifId(motifId);
    i++;

    // Find this motif's matrix header, but never search past the next MOTIF
    // line: doing so would attach the next motif's matrix to this ID.
    while (
      i < lines.length &&
      !lines[i].startsWith(matrixPrefix) &&
      !lines[i].startsWith(motifPrefix)
    ) {
      i++;
    }
    if (i >= lines.length) break;
    // No matrix for this motif; let the outer loop handle the new MOTIF line.
    if (lines[i].startsWith(motifPrefix)) continue;

    // Step over the "letter-probability matrix: alength= 4 w= 6 ..." line.
    i++;

    const A: number[] = [];
    const C: number[] = [];
    const G: number[] = [];
    const T: number[] = [];

    while (i < lines.length) {
      const row = lines[i];
      // Leave `i` on a terminating MOTIF line so the outer loop parses it.
      if (row === "" || row.startsWith(motifPrefix) || row.startsWith("URL ")) break;

      const vals = row.split(/\s+/).map(Number);
      // Check all four columns, not just the first, so NaN never enters the PFM.
      if (vals.length >= 4 && vals.slice(0, 4).every((v) => Number.isFinite(v))) {
        A.push(vals[0]);
        C.push(vals[1]);
        G.push(vals[2]);
        T.push(vals[3]);
      }
      i++;
    }

    if (A.length > 0) {
      motifs.push({ motifId, archetypeId, tfNames, family, pfm: { A, C, G, T } });
    }
  }

  return motifs;
}

/**
 * Split a Vierstra motif ID like "AC0001:DLX/LHX:Homeodomain" into its parts.
 *
 * - Three or more colon-separated fields: the first is the archetype ID, the
 *   second the TF names, and everything after the second colon (further
 *   colons included) is the family.
 * - Exactly two fields: the family is "Unknown".
 * - No colon at all: the whole ID is used for both the archetype ID and the
 *   TF names, and the family is "Unknown".
 */
function parseMotifId(id: string): { archetypeId: string; tfNames: string; family: string } {
  const [archetypeId, tfNames, ...familyParts] = id.split(":");
  if (tfNames === undefined) {
    return { archetypeId: id, tfNames: id, family: "Unknown" };
  }
  const family = familyParts.length > 0 ? familyParts.join(":") : "Unknown";
  return { archetypeId, tfNames, family };
}
0 commit comments