Skip to content

Commit 2757f4e

Browse files
committed
v1 - Meta Data with Data Time Decay
1 parent fb06856 commit 2757f4e

28 files changed

Lines changed: 1245 additions & 116 deletions

README.md

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,15 @@
55
## Features
66

77
- 🚀 **High Performance**: Built with C++ and optimized HNSW algorithm
8-
- 🐍 **Python Integration**: Native Python bindings with NumPy support
9-
- 🦀 **Rust CLI**: Command-line interface for easy database operations
10-
- 💾 **Persistent Storage**: Custom binary format with automatic save/load
11-
- 🔍 **Fast Search**: Approximate nearest neighbor search with configurable parameters
12-
- 📦 **Multi-Language**: C++, Python, and Rust APIs
8+
- 🧠 **Context Engine**: Structured metadata storage (Facts, Preferences, Events, Conversations)
9+
- ⏳ **Temporal Retrieval**: Time-weighted scoring with exponential decay
10+
- 🔍 **Filtered Search**: Domain-logic filtering (by type, source, tags) during HNSW search
11+
- 🐍 **Python Integration**: Native Python bindings with `FilterBuilder` support
12+
- 🦀 **Rust CLI**: Enhanced CLI for metadata and filtered operations
13+
- 💾 **Persistent Storage**: Version 2 binary format with automatic metadata persistence
14+
15+
[![PyPI](https://img.shields.io/pypi/v/feather-db)](https://pypi.org/project/feather-db/)
16+
[![Crates.io](https://img.shields.io/crates/v/feather-db-cli)](https://crates.io/crates/feather-db-cli)
1317

1418
## Quick Start
1519

@@ -36,6 +40,25 @@ for i, (id, dist) in enumerate(zip(ids, distances)):
3640

3741
# Save the database
3842
db.save()
43+
44+
### Context Engine (Phase 2)
45+
46+
```python
47+
from feather_db import DB, Metadata, ContextType, ScoringConfig, FilterBuilder
48+
49+
# Add with metadata
50+
meta = Metadata()
51+
meta.content = "User prefers dark mode"
52+
meta.type = ContextType.PREFERENCE
53+
meta.importance = 0.9
54+
db.add(id=1, vec=embedding, meta=meta)
55+
56+
# Search with filters and temporal decay
57+
fb = FilterBuilder()
58+
filter = fb.types(ContextType.PREFERENCE).min_importance(0.5).build()
59+
60+
results = db.search(query, k=5, filter=filter, scoring=ScoringConfig(half_life=30))
61+
```
3962
```
4063
4164
### C++ Usage
@@ -78,6 +101,18 @@ feather add my_db.feather 2 --npy vector2.npy
78101
feather search my_db.feather --npy query.npy --k 10
79102
```
80103

104+
### Rust CLI
105+
106+
The CLI is available as a native binary for fast database management.
107+
108+
```bash
109+
# Add with metadata
110+
feather add --npy vector.npy --content "Hello world" --source "cli" my_db 123
111+
112+
# Search with filters
113+
feather search --npy query.npy --type-filter 0 --source-filter "cli" my_db
114+
```
115+
81116
## Installation
82117

83118
### Python Package (Recommended)

bindings/feather.cpp

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,35 +8,65 @@ namespace py = pybind11;
88
PYBIND11_MODULE(core, m) {
    m.doc() = "Feather: SQLite for Vectors";

    // Context categories a metadata record can be tagged with.
    py::enum_<feather::ContextType>(m, "ContextType")
        .value("FACT", feather::ContextType::FACT)
        .value("PREFERENCE", feather::ContextType::PREFERENCE)
        .value("EVENT", feather::ContextType::EVENT)
        .value("CONVERSATION", feather::ContextType::CONVERSATION)
        .export_values();

    // Per-record metadata; every field is readable and writable from Python.
    py::class_<feather::Metadata>(m, "Metadata")
        .def(py::init<>())
        .def_readwrite("timestamp", &feather::Metadata::timestamp)
        .def_readwrite("importance", &feather::Metadata::importance)
        .def_readwrite("type", &feather::Metadata::type)
        .def_readwrite("source", &feather::Metadata::source)
        .def_readwrite("content", &feather::Metadata::content)
        .def_readwrite("tags_json", &feather::Metadata::tags_json);

    // Temporal-decay scoring knobs: half-life (days), blend weight, floor.
    py::class_<feather::ScoringConfig>(m, "ScoringConfig")
        .def(py::init<float, float, float>(),
             py::arg("half_life") = 30.0f,
             py::arg("weight") = 0.3f,
             py::arg("min") = 0.0f)
        .def_readwrite("decay_half_life_days", &feather::ScoringConfig::decay_half_life_days)
        .def_readwrite("time_weight", &feather::ScoringConfig::time_weight)
        .def_readwrite("min_weight", &feather::ScoringConfig::min_weight);

    // Metadata predicate applied while searching.
    py::class_<feather::SearchFilter>(m, "SearchFilter")
        .def(py::init<>())
        .def_readwrite("types", &feather::SearchFilter::types)
        .def_readwrite("source", &feather::SearchFilter::source)
        .def_readwrite("source_prefix", &feather::SearchFilter::source_prefix)
        .def_readwrite("timestamp_after", &feather::SearchFilter::timestamp_after)
        .def_readwrite("timestamp_before", &feather::SearchFilter::timestamp_before)
        .def_readwrite("importance_gte", &feather::SearchFilter::importance_gte)
        .def_readwrite("tags_contains", &feather::SearchFilter::tags_contains);

    // Read-only view of a single search hit.
    py::class_<feather::DB::SearchResult>(m, "SearchResult")
        .def_readonly("id", &feather::DB::SearchResult::id)
        .def_readonly("score", &feather::DB::SearchResult::score)
        .def_readonly("metadata", &feather::DB::SearchResult::metadata);

    // py::nodelete: Python never destroys the DB handle; the C++ side
    // owns its lifetime.
    py::class_<feather::DB, std::unique_ptr<feather::DB, py::nodelete>>(m, "DB")
        .def_static("open", &feather::DB::open, py::arg("path"), py::arg("dim") = 768)

        // Insert one vector under `id`, with optional metadata (a
        // default-constructed Metadata is stored when none is given).
        .def("add",
             [](feather::DB& db, uint64_t id, py::array_t<float> vec,
                const std::optional<feather::Metadata>& meta) {
                 const auto info = vec.request();
                 if (info.size != db.dim()) throw std::runtime_error("Dimension mismatch");
                 const float* data = static_cast<const float*>(info.ptr);
                 std::vector<float> values(data, data + info.size);
                 db.add(id, values, meta ? *meta : feather::Metadata());
             },
             py::arg("id"), py::arg("vec"), py::arg("meta") = std::nullopt)

        // k-NN search; filter/scoring are optional (nullptr = disabled).
        .def("search",
             [](const feather::DB& db, py::array_t<float> q, size_t k = 5,
                const feather::SearchFilter* filter = nullptr,
                const feather::ScoringConfig* scoring = nullptr) {
                 const auto info = q.request();
                 if (info.size != db.dim()) throw std::runtime_error("Query dimension mismatch");
                 const float* data = static_cast<const float*>(info.ptr);
                 std::vector<float> query_vec(data, data + info.size);
                 return db.search(query_vec, k, filter, scoring);
             },
             py::arg("q"), py::arg("k") = 5, py::arg("filter") = nullptr, py::arg("scoring") = nullptr)

        .def("get_metadata", &feather::DB::get_metadata, py::arg("id"))
        .def("save", &feather::DB::save)
        .def("dim", &feather::DB::dim);
}

examples/advanced_context_test.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
import os
2+
import time
3+
import json
4+
import numpy as np
5+
from feather_db import DB, Metadata, ContextType, ScoringConfig, FilterBuilder
6+
7+
# -----------------------------------------------------------------------------
8+
# 1. Deterministic Mock Embedding
9+
# -----------------------------------------------------------------------------
10+
# We'll map specific keywords to vector dimensions to simulate "meaning".
11+
# Dim 0: "work/business"
12+
# Dim 1: "personal/social"
13+
# Dim 2: "tech/coding"
14+
# Dim 3: "urgent/important"
15+
# Dim 4: "finance/money"
16+
DIM = 5


def mock_embed(text):
    """Embed `text` into a DIM-dimensional unit vector, deterministically.

    Each dimension is bumped by 1.0 when any keyword of its topic appears:
    dim 0 work/business, dim 1 personal/social, dim 2 tech/coding,
    dim 3 urgent/important, dim 4 finance/money. The result is L2-normalized.

    Texts with no keyword hits fall back to small pseudo-random noise (to
    avoid degenerate zero vectors); the noise generator is seeded from the
    text itself so the embedding is reproducible across runs — the original
    used unseeded np.random.rand, which broke the promised determinism.
    """
    text = text.lower()
    vec = np.zeros(DIM, dtype=np.float32)

    if any(w in text for w in ["meeting", "work", "campaign", "project", "quarter", "deadline"]):
        vec[0] += 1.0
    if any(w in text for w in ["party", "dinner", "friend", "birthday", "movie"]):
        vec[1] += 1.0
    if any(w in text for w in ["code", "python", "cpp", "db", "api", "bug", "feature"]):
        vec[2] += 1.0
    if any(w in text for w in ["urgent", "asap", "critical", "blocking"]):
        vec[3] += 1.0
    if any(w in text for w in ["budget", "cost", "price", "invoice", "salary"]):
        vec[4] += 1.0

    # Normalize, or fall back to deterministic noise for keyword-free text.
    norm = np.linalg.norm(vec)
    if norm > 0:
        vec = vec / norm
    else:
        import zlib  # local import: only needed on the no-keyword path
        rng = np.random.default_rng(zlib.crc32(text.encode("utf-8")))
        vec = rng.random(DIM).astype(np.float32) * 0.1

    return vec
42+
43+
# -----------------------------------------------------------------------------
# 2. Dataset Generation
# -----------------------------------------------------------------------------
NOW = int(time.time())
DAY = 86400  # seconds per day

# Synthetic corpus: each row carries everything needed to build one Metadata
# record at ingest time (age_days becomes an absolute timestamp later).
DATASET = [
    # Recent highly important work
    {"id": 1,
     "text": "Urgent bug fix needed for API auth in Phase 2",
     "type": ContextType.FACT, "source": "slack:dev-team",
     "importance": 1.0, "age_days": 0.1,  # Just now
     "tags": ["bug", "api", "v2"]},
    # Older work meeting
    {"id": 2,
     "text": "Q1 Planning meeting notes: focus on stability",
     "type": ContextType.EVENT, "source": "gcal",
     "importance": 0.8, "age_days": 30,  # 1 month ago
     "tags": ["planning", "meeting"]},
    # Personal info
    {"id": 3,
     "text": "User prefers dark mode and high contrast",
     "type": ContextType.PREFERENCE, "source": "settings_ui",
     "importance": 1.0, "age_days": 100,  # Old but permanent preference
     "tags": ["ui", "a11y"]},
    # Code snippet
    {"id": 4,
     "text": "Python script to migrate database schema",
     "type": ContextType.FACT, "source": "github",
     "importance": 0.5, "age_days": 2,
     "tags": ["python", "migration"]},
    # Irrelevant noise
    {"id": 5,
     "text": "Dinner receipt for pizza party",
     "type": ContextType.EVENT, "source": "email",
     "importance": 0.1, "age_days": 5,
     "tags": ["finance", "food"]},
    {"id": 6,
     "text": "Invoice for cloud hosting services",
     "type": ContextType.FACT, "source": "email",
     "importance": 0.9, "age_days": 1,
     "tags": ["finance", "cloud"]},
]
110+
111+
# -----------------------------------------------------------------------------
# 3. Setup and Population
# -----------------------------------------------------------------------------
DB_PATH = "test_db_advanced.feather"

# Start from a clean slate so repeated runs are reproducible.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

print(f"🚀 Initializing Feather DB at {DB_PATH} with dim={DIM}")
db = DB.open(DB_PATH, DIM)

print(f"📥 Ingesting {len(DATASET)} diverse records...")
for record in DATASET:
    # Translate the dataset row into a Metadata object; relative ages
    # become absolute epoch timestamps.
    meta = Metadata()
    meta.content = record["text"]
    meta.type = record["type"]
    meta.source = record["source"]
    meta.importance = record["importance"]
    meta.timestamp = int(NOW - (record["age_days"] * DAY))
    meta.tags_json = json.dumps(record["tags"])

    db.add(record["id"], mock_embed(record["text"]), meta)
133+
134+
# -----------------------------------------------------------------------------
# 4. Scenario Testing
# -----------------------------------------------------------------------------

def run_query(test_name, query_text, filter_obj=None, scoring_obj=None):
    """Embed `query_text`, run a k=3 search and pretty-print the hits.

    filter_obj and scoring_obj are forwarded verbatim to db.search()
    (None disables filtering / temporal scoring respectively).
    Reads the module-level `db`, `NOW` and `DAY`.
    """
    print(f"\n🧪 Test: {test_name}")
    print(f" Query: '{query_text}'")
    q_vec = mock_embed(query_text)

    # The original routed scoring_obj through a pointless local alias
    # (`real_scoring`); pass it straight through instead.
    results = db.search(q_vec, k=3, filter=filter_obj, scoring=scoring_obj)

    for i, r in enumerate(results):
        # Reconstruct the record age from its stored timestamp for display.
        age_days = (NOW - r.metadata.timestamp) / DAY
        print(f" {i+1}. [Score: {r.score:.4f}] [Age: {age_days:.1f}d] [{r.metadata.type}] {r.metadata.content}")
152+
# Scenario A: Baseline Similarity (Is "bug" related to "code"?)
# Expect: Bug fix (ID 1) and Migration script (ID 4) to appear.
run_query("Baseline Context Retrieval (Similarity Only)", "coding bug issue",
          scoring_obj=ScoringConfig(half_life=365, weight=0.0))

# Scenario B: Finding the most URGENT recent item
# Expect: The "Urgent bug fix" (ID 1) should be #1 significantly because of recency + importance.
run_query("Urgent Recent Items (High Temporal Weight)", "urgent issue",
          scoring_obj=ScoringConfig(half_life=1, weight=0.5))

# Scenario C: "Recall" context from a month ago
# Expect: Q1 Planning meeting (ID 2).
run_query("Memory Recall (Older Events)", "planning meeting",
          scoring_obj=ScoringConfig(half_life=60, weight=0.1))

# Scenario D: Filtering for specific source (e.g. only 'email')
# Expect: Invoice (ID 6) and Pizza receipt (ID 5).
email_filter = FilterBuilder().types([ContextType.FACT, ContextType.EVENT]).source("email").build()
run_query("Source Filtering (Emails only)", "money cost", filter_obj=email_filter)

# Scenario E: Filtering for Preferences (User personalization)
# Expect: Only ID 3.
# NOTE(review): use a FRESH FilterBuilder here. The original reused Scenario
# D's builder instance, which presumably still carried the source("email")
# constraint and would wrongly exclude ID 3 (source "settings_ui") — a new
# builder is correct regardless of whether builders retain state.
pref_filter = FilterBuilder().types(ContextType.PREFERENCE).build()
run_query("Personalization (Preferences only)", "ui setting", filter_obj=pref_filter)

print("\n✅ Verification Complete.")

0 commit comments

Comments
 (0)