-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindexer.py
More file actions
89 lines (73 loc) · 2.67 KB
/
indexer.py
File metadata and controls
89 lines (73 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
"""
Indexes transcript chunks into a local vector store using Ollama embeddings.
Enables semantic search across all processed lessons.
"""
import os
import json
import logging
import requests
import numpy as np
from pathlib import Path
from typing import List, Dict
# Setup Logging: timestamped, level-tagged messages to stderr for the whole script.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)
# Constants
WORKDIR = Path.cwd()  # resolved at import time — run the script from the project root
TRANSCRIPT_DIR = WORKDIR / "transcripts"  # input: plain-text transcripts, one .txt per lesson
INDEX_FILE = WORKDIR / "vector_index.json"  # output: JSON list of {file, chunk_id, text, embedding}
EMBEDDING_MODEL = "nomic-embed-text" # Great local embedding model
OLLAMA_EMBED_URL = "http://localhost:11434/api/embeddings"  # default local Ollama endpoint
def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for *text* from the local Ollama server.

    Args:
        text: The text chunk to embed.

    Returns:
        The embedding as a list of floats, or an empty list when the
        request fails or the response is malformed (callers treat an
        empty list as "skip this chunk").
    """
    payload = {
        "model": EMBEDDING_MODEL,
        "prompt": text,
    }
    try:
        r = requests.post(OLLAMA_EMBED_URL, json=payload, timeout=60)
        r.raise_for_status()
        return r.json()["embedding"]
    except (requests.RequestException, ValueError, KeyError) as e:
        # RequestException: connection/timeout/HTTP errors (raise_for_status).
        # ValueError: response body was not valid JSON.
        # KeyError: JSON response lacked an "embedding" field.
        # Narrowed from a bare `except Exception` so genuine bugs still surface.
        logger.error("Embedding failed: %s", e)
        return []
def _load_index() -> List[Dict]:
    """Load the existing vector index from disk; return [] when absent or corrupt."""
    if not INDEX_FILE.exists():
        return []
    try:
        with open(INDEX_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError:
        logger.warning("Existing index is not valid JSON; rebuilding from scratch.")
        return []

def index_transcripts() -> None:
    """Embed and index every not-yet-indexed transcript in TRANSCRIPT_DIR.

    Loads the existing index so files indexed on previous runs are
    skipped, splits each new transcript into blank-line-separated
    chunks, embeds each sufficiently long chunk via Ollama, and
    rewrites INDEX_FILE with the combined entries.
    """
    if not TRANSCRIPT_DIR.exists():
        logger.error("Transcript directory not found: %s", TRANSCRIPT_DIR)
        return
    index_data = _load_index()
    # Files already present in the index are skipped (no re-embedding).
    indexed_files = {item["file"] for item in index_data}
    transcript_files = list(TRANSCRIPT_DIR.glob("*.txt"))
    logger.info("Checking %d transcript files...", len(transcript_files))
    new_entries = 0
    for txt_file in transcript_files:
        if txt_file.name in indexed_files:
            continue
        logger.info("Indexing: %s", txt_file.name)
        content = txt_file.read_text(encoding="utf-8")
        # Simple chunking for indexing (can be improved): assume
        # blank-line-separated segments or paragraphs.
        for i, chunk in enumerate(content.split("\n\n")):
            chunk = chunk.strip()
            if len(chunk) < 50:  # Skip very short snippets
                continue
            embedding = get_embedding(chunk)
            if embedding:  # empty list means the embedding call failed
                index_data.append({
                    "file": txt_file.name,
                    "chunk_id": i,
                    "text": chunk,
                    "embedding": embedding,
                })
                new_entries += 1
    # Save only when something changed (or no index exists yet) to avoid
    # rewriting a potentially large JSON file on every no-op run.
    if new_entries or not INDEX_FILE.exists():
        with open(INDEX_FILE, "w", encoding="utf-8") as f:
            json.dump(index_data, f)
    logger.info("Indexing complete. Total entries in index: %d", len(index_data))
if __name__ == "__main__":
    # Script entry point: build/refresh the local vector index once.
    index_transcripts()