By Xavier Collantes
8/14/2025
import pickle
from pathlib import Path
from typing import List, Optional, Tuple

import faiss
import numpy as np
5
class FAISSVectorStore:
    """Disk-persisted FAISS HNSW index paired with one metadata dict per vector.

    Vectors are L2-normalised before they are indexed or used as queries.
    The index is created with FAISS's default L2 metric, which reports
    *squared* Euclidean distances; for unit vectors that distance ``d``
    converts exactly to cosine similarity as ``1 - d / 2``.

    The index and the metadata list are written to disk after every
    successful ``add_batch`` and are reloaded on construction when
    ``index_path`` already exists.  An on-disk index that was built from
    un-normalised vectors should be rebuilt, otherwise the scores returned
    by ``search`` are not true cosine similarities.
    """

    def __init__(self, dimension: int, index_path: str = "vector_index.faiss"):
        """Open the store at ``index_path`` or create an empty one.

        Args:
            dimension: Length of every vector stored in the index.
            index_path: File that holds the serialised FAISS index.  The
                metadata pickle lives next to it (see ``metadata_path``).

        Raises:
            ValueError: If an existing index does not match ``dimension`` or
                is out of sync with its metadata file.
            FileNotFoundError: If the index exists but its metadata does not.
        """
        self.dimension = dimension
        self.index_path = Path(index_path)
        self.metadata_path = self._metadata_path_for(self.index_path)
        self.metadata: List[dict] = []

        if self.index_path.exists():
            self.load_index()
        else:
            # HNSW with 32 links per node: good speed/accuracy balance.
            self.index = faiss.IndexHNSWFlat(dimension, 32)

    @staticmethod
    def _metadata_path_for(index_path: Path) -> Path:
        """Return the sibling ``<stem>_metadata.pkl`` path for an index file.

        Built from the file stem rather than a string replace so that a path
        without a ``.faiss`` suffix can never resolve to the index file
        itself, and directory names are left untouched.
        """
        return index_path.with_name(f"{index_path.stem}_metadata.pkl")

    @staticmethod
    def _as_unit_rows(vectors) -> np.ndarray:
        """Return ``vectors`` as a C-contiguous float32 matrix of unit rows.

        A 1-D input is treated as a single row.  All-zero rows are left as
        zeros instead of producing NaNs.
        """
        rows = np.atleast_2d(np.asarray(vectors, dtype=np.float32))
        if rows.ndim != 2:
            raise ValueError(
                f"expected a 1-D or 2-D array of vectors, got {rows.ndim} dimensions"
            )
        norms = np.linalg.norm(rows, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # avoid division by zero for null vectors
        # FAISS requires float32, C-contiguous input.
        return np.ascontiguousarray(rows / norms, dtype=np.float32)

    def add_batch(self, vectors: np.ndarray, metadata_list: List[dict]):
        """Add several vectors and their metadata, then persist to disk.

        Args:
            vectors: Array of shape ``(n, dimension)`` (or ``(dimension,)``
                for a single vector).
            metadata_list: One metadata dict per vector, in the same order.

        Raises:
            ValueError: If the vector width differs from ``dimension`` or the
                number of metadata entries differs from the number of vectors.
        """
        rows = self._as_unit_rows(vectors)
        if rows.shape[1] != self.dimension:
            raise ValueError(
                f"expected vectors of dimension {self.dimension}, got {rows.shape[1]}"
            )
        # Metadata is looked up by FAISS row id, so the two must stay aligned.
        if len(metadata_list) != rows.shape[0]:
            raise ValueError(
                f"got {rows.shape[0]} vectors but {len(metadata_list)} metadata entries"
            )
        if rows.shape[0] == 0:
            return

        self.index.add(rows)
        self.metadata.extend(metadata_list)
        self.save_index()

    def search(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[float, dict]]:
        """Return up to ``top_k`` ``(cosine_similarity, metadata)`` pairs.

        Results are in the order FAISS returns them (nearest first).  An
        empty store or a non-positive ``top_k`` yields an empty list.

        Raises:
            ValueError: If the query width differs from ``dimension``.
        """
        if top_k <= 0 or not self.metadata:
            return []

        query = np.asarray(query_vector, dtype=np.float32).reshape(1, -1)
        if query.shape[1] != self.dimension:
            raise ValueError(
                f"expected a query of dimension {self.dimension}, got {query.shape[1]}"
            )
        query = self._as_unit_rows(query)

        distances, indices = self.index.search(query, top_k)

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            if idx == -1:  # FAISS pads missing neighbours with -1
                continue
            # Squared L2 distance between unit vectors is 2 - 2*cos.
            similarity = 1.0 - float(distance) / 2.0
            results.append((similarity, self.metadata[int(idx)]))
        return results

    def save_index(self):
        """Persist the index and the metadata list to disk."""
        self.index_path.parent.mkdir(parents=True, exist_ok=True)
        faiss.write_index(self.index, str(self.index_path))
        with open(self.metadata_path, 'wb') as f:
            pickle.dump(self.metadata, f)

    def load_index(self):
        """Load the index and the metadata list from disk.

        Raises:
            ValueError: If the stored index has a different dimension, or the
                number of metadata entries differs from the number of vectors.
            FileNotFoundError: If the metadata file is missing.
        """
        index = faiss.read_index(str(self.index_path))
        if index.d != self.dimension:
            raise ValueError(
                f"index at {self.index_path} has dimension {index.d}, "
                f"expected {self.dimension}"
            )
        if not self.metadata_path.exists():
            raise FileNotFoundError(
                f"metadata file {self.metadata_path} is missing for index {self.index_path}"
            )
        # SECURITY: pickle executes arbitrary code while loading.  Only open
        # metadata files that this process (or a trusted one) wrote.
        with open(self.metadata_path, 'rb') as f:
            metadata = pickle.load(f)
        if len(metadata) != index.ntotal:
            raise ValueError(
                f"index holds {index.ntotal} vectors but metadata has "
                f"{len(metadata)} entries"
            )
        self.index = index
        self.metadata = metadata
61
# Demo: embed a few sentences, index them, then run a single query.
from sentence_transformers import SentenceTransformer

# The store is sized to the 384-dimensional MiniLM sentence embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')
store = FAISSVectorStore(dimension=384)

# Corpus to index
documents = [
    "Python is a powerful programming language",
    "Machine learning enables computers to learn",
    "Vector databases store high-dimensional data",
    "FAISS provides efficient similarity search",
    "Natural language processing analyzes text",
]

# One embedding per sentence, plus a metadata record carrying the text and
# its position in the corpus.
embeddings = model.encode(documents)
metadata = [
    {"text": sentence, "id": position}
    for position, sentence in enumerate(documents)
]

store.add_batch(embeddings, metadata)

# Encode the query as a one-item batch and search with its only row.
query = "programming languages and coding"
query_embedding = model.encode([query])
results = store.search(query_embedding[0], top_k=3)

print(f"Query: {query}")
for similarity, meta in results:
    print(f" Similarity: {similarity:.3f} - {meta['text']}")
92
1from qdrant_client import QdrantClient
2from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
3
class ProductionVectorStore:
    """Qdrant-backed document store with payload filtering.

    Documents are stored as points whose payload carries ``text``,
    ``category``, ``author``, ``timestamp`` and ``tags``; searches can be
    restricted on category, author and tags.
    """

    def __init__(self,
                 host: str = "localhost",
                 port: int = 6333,
                 collection_name: str = "documents",
                 vector_size: int = 384):
        """Connect to Qdrant and make sure the collection exists.

        Args:
            host: Qdrant server host.
            port: Qdrant server port.
            collection_name: Collection that holds the documents.
            vector_size: Embedding dimension used when the collection is
                created.
        """
        self.client: QdrantClient = QdrantClient(host=host, port=port)
        self.collection_name = collection_name
        self.vector_size = vector_size
        self._ensure_collection()

    def _ensure_collection(self) -> None:
        """Create the collection unless it is already present.

        Existence is checked explicitly so that connection or configuration
        errors from ``create_collection`` propagate instead of being mistaken
        for "collection already exists".
        """
        if self.client.collection_exists(self.collection_name):
            return

        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(
                size=self.vector_size,  # Embedding dimension
                distance=Distance.COSINE
            ),
            # Performance optimizations
            optimizers_config={
                "default_segment_number": 2,
                "max_segment_size": 200000,
                "memmap_threshold": 50000,
            },
            hnsw_config={
                "m": 16,
                "ef_construct": 100,
            }
        )

    @staticmethod
    def _point_id(doc: dict):
        """Return the document's own id, or a fresh UUID string if it has none.

        A positional fallback would make every new batch overwrite the points
        written by the previous one, so documents without an id get a random
        UUID.  Callers that need idempotent re-ingestion should supply ids.
        """
        import uuid

        doc_id = doc.get("id")
        return str(uuid.uuid4()) if doc_id is None else doc_id

    @staticmethod
    def _hit_to_dict(result) -> dict:
        """Flatten one scored search hit into a plain dict."""
        payload = result.payload or {}  # payload may be absent on a hit
        return {
            "id": result.id,
            "score": result.score,
            "text": payload.get("text"),
            "category": payload.get("category"),
            "author": payload.get("author"),
            "tags": payload.get("tags", [])
        }

    def add_documents(self, documents: List[dict]):
        """Upsert documents with their embeddings and metadata.

        Each document must provide ``embedding`` and ``text``; ``id``,
        ``category``, ``author``, ``timestamp`` and ``tags`` are optional.
        An empty list is a no-op.

        Raises:
            KeyError: If a document lacks ``embedding`` or ``text``.
        """
        if not documents:
            return

        points = [
            PointStruct(
                id=self._point_id(doc),
                vector=doc["embedding"],
                payload={
                    "text": doc["text"],
                    "category": doc.get("category", "general"),
                    "author": doc.get("author", "unknown"),
                    "timestamp": doc.get("timestamp"),
                    "tags": doc.get("tags", [])
                }
            )
            for doc in documents
        ]

        # wait=True blocks until the write is applied, so a search issued
        # right afterwards sees the new points.
        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
            wait=True
        )

    def search_with_filters(self,
                            query_vector: List[float],
                            category: Optional[str] = None,
                            author: Optional[str] = None,
                            tags: Optional[List[str]] = None,
                            top_k: int = 5) -> List[dict]:
        """Search by vector, optionally restricted by payload filters.

        All supplied filters must match.  ``tags`` matches documents that
        carry at least one of the given tags.  Falsy filter values (``None``,
        ``""``, ``[]``) are ignored.

        Returns:
            One dict per hit with ``id``, ``score``, ``text``, ``category``,
            ``author`` and ``tags``.
        """
        filter_conditions = [
            FieldCondition(key=key, match=MatchValue(value=value))
            for key, value in (("category", category), ("author", author))
            if value
        ]

        if tags:
            from qdrant_client.models import MatchAny
            filter_conditions.append(
                FieldCondition(key="tags", match=MatchAny(any=tags))
            )

        # No conditions means an unfiltered search.
        search_filter = Filter(must=filter_conditions) if filter_conditions else None

        # NOTE(review): newer qdrant-client releases deprecate `search` in
        # favour of `query_points` - confirm against the pinned version.
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            query_filter=search_filter,
            limit=top_k,
            with_payload=True
        )

        return [self._hit_to_dict(result) for result in results]
109
# Demo: ingest three tagged documents into Qdrant and run filtered searches.
store = ProductionVectorStore()

# Each record is embedded from its own text; ids start at 1.
documents = [
    {
        "id": doc_id,
        "text": text,
        "embedding": model.encode(text).tolist(),
        "category": category,
        "author": author,
        "tags": tags,
    }
    for doc_id, (text, category, author, tags) in enumerate(
        (
            (
                "Introduction to Python programming",
                "programming",
                "jane_doe",
                ["python", "tutorial", "beginner"],
            ),
            (
                "Advanced machine learning techniques",
                "ai",
                "john_smith",
                ["ml", "advanced", "algorithms"],
            ),
            (
                "Database design principles",
                "database",
                "jane_doe",
                ["database", "design", "sql"],
            ),
        ),
        start=1,
    )
]

store.add_documents(documents)

# The same query vector is reused for all three filtered searches below.
query_embedding = model.encode("python coding tutorial").tolist()

# Restrict to one category
programming_results = store.search_with_filters(
    query_vector=query_embedding, category="programming", top_k=5
)

# Restrict to one author
jane_results = store.search_with_filters(
    query_vector=query_embedding, author="jane_doe", top_k=5
)

# Restrict to documents carrying any of these tags
tutorial_results = store.search_with_filters(
    query_vector=query_embedding, tags=["tutorial", "beginner"], top_k=5
)

print("Programming category results:")
for result in programming_results:
    print(f" Score: {result['score']:.3f} - {result['text']}")
170
Related by topics: