|
import json
import os
import time
import zlib

import numpy as np

from feather_db import DB, ContextType, FilterBuilder, Metadata, ScoringConfig
| 6 | + |
# -----------------------------------------------------------------------------
# 1. Deterministic Mock Embedding
# -----------------------------------------------------------------------------
# We'll map specific keywords to vector dimensions to simulate "meaning".
# Dim 0: "work/business"
# Dim 1: "personal/social"
# Dim 2: "tech/coding"
# Dim 3: "urgent/important"
# Dim 4: "finance/money"
# Number of dimensions in every mock embedding vector (one per theme above).
DIM = 5
| 17 | + |
def mock_embed(text, dim=None):
    """Map *text* to a deterministic unit vector simulating semantic meaning.

    Each dimension tracks one keyword theme (0=work, 1=personal, 2=tech,
    3=urgent, 4=finance). The result is L2-normalized; texts matching no
    keyword get small pseudo-random noise seeded from the text itself, so
    repeated calls always return the same vector (the previous implementation
    used unseeded np.random here, contradicting "deterministic").

    Args:
        text: Input string to embed (matched case-insensitively).
        dim: Vector length; defaults to the module-level DIM. Keyword axes
             above ``dim - 1`` are simply skipped.

    Returns:
        np.ndarray of shape (dim,), dtype float32; unit L2 norm when at
        least one keyword matched, otherwise noise in [0, 0.1).
    """
    if dim is None:
        dim = DIM
    text = text.lower()
    vec = np.zeros(dim, dtype=np.float32)

    # (dimension, keywords) table keeps the theme -> axis mapping in one place.
    keyword_table = (
        (0, ("meeting", "work", "campaign", "project", "quarter", "deadline")),
        (1, ("party", "dinner", "friend", "birthday", "movie")),
        (2, ("code", "python", "cpp", "db", "api", "bug", "feature")),
        (3, ("urgent", "asap", "critical", "blocking")),
        (4, ("budget", "cost", "price", "invoice", "salary")),
    )
    for axis, words in keyword_table:
        if axis < dim and any(w in text for w in words):
            vec[axis] += 1.0

    norm = np.linalg.norm(vec)
    if norm > 0:
        return vec / norm

    # No keywords matched: emit small noise to avoid zero-vector issues.
    # Seed from a stable hash of the text — builtin hash() is salted per
    # process, so zlib.crc32 is used to keep results reproducible.
    rng = np.random.default_rng(zlib.crc32(text.encode("utf-8")))
    return rng.random(dim).astype(np.float32) * 0.1
| 42 | + |
# -----------------------------------------------------------------------------
# 2. Dataset Generation
# -----------------------------------------------------------------------------
# Reference point for all record timestamps ("now", epoch seconds).
NOW = int(time.time())
# Seconds per day, used to convert "age_days" into epoch offsets.
DAY = 86400

# Each record: id, raw text (embedded via mock_embed at ingest), feather_db
# context type / source / importance metadata, age in days (converted to an
# absolute timestamp at ingest), and free-form tags serialized to JSON.
DATASET = [
    # Recent highly important work
    {
        "id": 1,
        "text": "Urgent bug fix needed for API auth in Phase 2",
        "type": ContextType.FACT,
        "source": "slack:dev-team",
        "importance": 1.0,
        "age_days": 0.1,  # Just now
        "tags": ["bug", "api", "v2"]
    },
    # Older work meeting
    {
        "id": 2,
        "text": "Q1 Planning meeting notes: focus on stability",
        "type": ContextType.EVENT,
        "source": "gcal",
        "importance": 0.8,
        "age_days": 30,  # 1 month ago
        "tags": ["planning", "meeting"]
    },
    # Personal info
    {
        "id": 3,
        "text": "User prefers dark mode and high contrast",
        "type": ContextType.PREFERENCE,
        "source": "settings_ui",
        "importance": 1.0,
        "age_days": 100,  # Old but permanent preference
        "tags": ["ui", "a11y"]
    },
    # Code snippet
    {
        "id": 4,
        "text": "Python script to migrate database schema",
        "type": ContextType.FACT,
        "source": "github",
        "importance": 0.5,
        "age_days": 2,
        "tags": ["python", "migration"]
    },
    # Irrelevant noise
    {
        "id": 5,
        "text": "Dinner receipt for pizza party",
        "type": ContextType.EVENT,
        "source": "email",
        "importance": 0.1,
        "age_days": 5,
        "tags": ["finance", "food"]
    },
    {
        "id": 6,
        "text": "Invoice for cloud hosting services",
        "type": ContextType.FACT,
        "source": "email",
        "importance": 0.9,
        "age_days": 1,
        "tags": ["finance", "cloud"]
    },
]
| 110 | + |
# -----------------------------------------------------------------------------
# 3. Setup and Population
# -----------------------------------------------------------------------------
DB_PATH = "test_db_advanced.feather"

# Start from a clean slate so repeated runs don't accumulate records.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

print(f"🚀 Initializing Feather DB at {DB_PATH} with dim={DIM}")
db = DB.open(DB_PATH, DIM)

print(f"📥 Ingesting {len(DATASET)} diverse records...")
for record in DATASET:
    # Embed the raw text first, then assemble the metadata payload.
    embedding = mock_embed(record["text"])

    entry = Metadata()
    entry.content = record["text"]
    entry.type = record["type"]
    entry.source = record["source"]
    entry.importance = record["importance"]
    # Convert the relative age into an absolute epoch timestamp.
    entry.timestamp = int(NOW - (record["age_days"] * DAY))
    entry.tags_json = json.dumps(record["tags"])

    db.add(record["id"], embedding, entry)
| 133 | + |
# -----------------------------------------------------------------------------
# 4. Scenario Testing
# -----------------------------------------------------------------------------

def run_query(test_name, query_text, filter_obj=None, scoring_obj=None):
    """Embed *query_text*, search the global ``db``, and print the top-3 hits.

    Args:
        test_name: Human-readable label for the scenario being exercised.
        query_text: Query string, embedded via mock_embed().
        filter_obj: Optional pre-built metadata filter forwarded to db.search.
        scoring_obj: Optional ScoringConfig forwarded to db.search.
    """
    print(f"\n🧪 Test: {test_name}")
    print(f" Query: '{query_text}'")
    q_vec = mock_embed(query_text)

    # Fixed: the config was previously copied into a redundant local
    # (`real_scoring = scoring_obj`) before use; pass it through directly.
    results = db.search(q_vec, k=3, filter=filter_obj, scoring=scoring_obj)

    for i, r in enumerate(results):
        age_days = (NOW - r.metadata.timestamp) / DAY
        print(f" {i+1}. [Score: {r.score:.4f}] [Age: {age_days:.1f}d] [{r.metadata.type}] {r.metadata.content}")
| 151 | + |
# Scenarios A–C exercise pure scoring behaviour (no metadata filters).
# Each tuple is (label, query text, scoring configuration).
scoring_scenarios = (
    # A: baseline similarity — temporal weight zeroed, so only the embedding
    # match matters. Expect the bug fix (ID 1) and migration script (ID 4).
    ("Baseline Context Retrieval (Similarity Only)", "coding bug issue",
     ScoringConfig(half_life=365, weight=0.0)),
    # B: urgency hunt — short half-life plus heavy temporal weight should put
    # the fresh "Urgent bug fix" (ID 1) firmly in first place.
    ("Urgent Recent Items (High Temporal Weight)", "urgent issue",
     ScoringConfig(half_life=1, weight=0.5)),
    # C: month-old recall — gentle decay keeps the Q1 planning meeting (ID 2)
    # reachable.
    ("Memory Recall (Older Events)", "planning meeting",
     ScoringConfig(half_life=60, weight=0.1)),
)

for label, query, cfg in scoring_scenarios:
    run_query(label, query, scoring_obj=cfg)
| 163 | + |
# Scenario D: Filtering for specific source (e.g. only 'email')
# Expect: Invoice (ID 6) and Pizza receipt (ID 5).
email_filter = (
    FilterBuilder()
    .types([ContextType.FACT, ContextType.EVENT])
    .source("email")
    .build()
)
run_query("Source Filtering (Emails only)", "money cost", filter_obj=email_filter)

# Scenario E: Filtering for Preferences (User personalization)
# Expect: Only ID 3.
# Fixed: use a fresh FilterBuilder here. Previously the builder from
# Scenario D was reused, so its source("email") constraint could leak into
# this filter and exclude the preference record (ID 3, source "settings_ui").
# NOTE(review): .types() is given a list for consistency with the call above;
# confirm the API also accepted the previous bare-enum form.
pref_filter = FilterBuilder().types([ContextType.PREFERENCE]).build()
run_query("Personalization (Preferences only)", "ui setting", filter_obj=pref_filter)

print("\n✅ Verification Complete.")
0 commit comments