Skip to content

Commit 75043ab

Browse files
authored
Merge pull request #510 from kemingy/pgvectors
add benchmark for pgvecto.rs
2 parents a393581 + c86ce6d commit 75043ab

File tree

5 files changed

+144
-0
lines changed

5 files changed

+144
-0
lines changed

.github/workflows/benchmarks.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ jobs:
5454
- panng_ngt
5555
- pg_embedding
5656
- pgvector
57+
- pgvecto_rs
5758
- pynndescent
5859
- redisearch
5960
- qdrant

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Evaluated
4444
* [Milvus](https://github.com/milvus-io/milvus) ![https://img.shields.io/github/stars/milvus-io/milvus?style=social](https://img.shields.io/github/stars/milvus-io/milvus?style=social): [Knowhere](https://github.com/milvus-io/knowhere)
4545
* [Zilliz(Glass)](https://github.com/hhy3/pyglass)
4646
* [pgvector](https://github.com/pgvector/pgvector) ![https://img.shields.io/github/stars/pgvector/pgvector?style=social](https://img.shields.io/github/stars/pgvector/pgvector?style=social)
47+
* [pgvecto.rs](https://github.com/tensorchord/pgvecto.rs) ![https://img.shields.io/github/stars/tensorchord/pgvecto.rs?style=social](https://img.shields.io/github/stars/tensorchord/pgvecto.rs?style=social)
4748
* [RediSearch](https://github.com/redisearch/redisearch) ![https://img.shields.io/github/stars/redisearch/redisearch?style=social](https://img.shields.io/github/stars/redisearch/redisearch?style=social)
4849
* [pg_embedding](https://github.com/neondatabase/pg_embedding) ![https://img.shields.io/github/stars/pg_embedding/pg_embedding?style=social](https://img.shields.io/github/stars/neondatabase/pg_embedding?style=social)
4950
* [Descartes(01AI)](https://github.com/xiaoming-01ai/descartes)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Benchmark image for pgvecto.rs: the upstream PostgreSQL + pgvecto.rs image
# with Python and the benchmark runner layered on top.
FROM tensorchord/pgvecto-rs:pg16-v0.3.0-alpha.1

# https://github.com/tensorchord/pgvecto.rs

RUN apt-get update \
    && apt-get install -y python3-pip

WORKDIR /home/app
COPY requirements.txt .

RUN python3 -m pip install --break-system-packages -r requirements.txt
RUN python3 -m pip install --break-system-packages psycopg[binary]

COPY run_algorithm.py .

ENV POSTGRES_PASSWORD=password
ENV POSTGRES_USER=postgres

# The generated entrypoint initialises a cluster, starts the server in the
# background with the vectors library preloaded, then launches the runner.
# Instead of sleeping a fixed 5 seconds it polls pg_isready (at most 60 times,
# one second apart) so the runner starts as soon as the server accepts
# connections and does not race a slow start-up.
RUN printf '#!/bin/bash\n\
runuser -u postgres -- initdb \n\
runuser -u postgres -- postgres -c shared_preload_libraries=vectors.so &\n\
for _ in $(seq 1 60); do pg_isready -q && break; sleep 1; done\n\
python3 -u run_algorithm.py "$@"' > entrypoint.sh \
    && chmod u+x entrypoint.sh

ENTRYPOINT ["/home/app/entrypoint.sh"]
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
float:
2+
any:
3+
- base_args: ['@metric']
4+
constructor: PGVectoRS
5+
disabled: false
6+
docker_tag: ann-benchmarks-pgvecto_rs
7+
module: ann_benchmarks.algorithms.pgvecto_rs
8+
name: pgvecto_rs
9+
run_groups:
10+
M-16:
11+
arg_groups: [{M: 16, efConstruction: 200}]
12+
args: {}
13+
query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
14+
M-24:
15+
arg_groups: [{M: 24, efConstruction: 200}]
16+
args: {}
17+
query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import struct
2+
import time
3+
4+
import numpy as np
5+
import psycopg
6+
from psycopg.adapt import Dumper, Loader
7+
from psycopg.pq import Format
8+
from psycopg.types import TypeInfo
9+
10+
from ..base.module import BaseANN
11+
12+
13+
class VectorDumper(Dumper):
    """Binary psycopg dumper for vector values.

    The wire layout written by :meth:`dump` is a little-endian unsigned
    16-bit element count followed by that many little-endian 32-bit floats.
    """

    format = Format.BINARY

    def dump(self, obj):
        """Return ``obj`` (a sized sequence of numbers) packed as bytes."""
        dim = len(obj)
        # "<" disables alignment padding, so packing the header and the
        # payload separately yields the same bytes as one combined format.
        header = struct.pack("<H", dim)
        payload = struct.pack(f"<{dim}f", *obj)
        return header + payload
18+
19+
20+
class VectorLoader(Loader):
    """Binary psycopg loader that turns a vector value into a NumPy array.

    The expected layout is a little-endian unsigned 16-bit element count
    followed by that many little-endian 32-bit floats.

    ``format`` is set explicitly because psycopg loaders default to the text
    format; without it this binary parser would be registered for text
    results and never be used for binary ones.
    """

    format = Format.BINARY

    def load(self, buf):
        """Decode ``buf`` and return a one-dimensional float32 array."""
        if isinstance(buf, memoryview):
            # Take a private copy of the bytes before parsing them.
            buf = bytes(buf)
        dim = struct.unpack_from("<H", buf)[0]
        # Skip the 2-byte header and read exactly ``dim`` floats.
        return np.frombuffer(buf, dtype="<f", count=dim, offset=2)
26+
27+
28+
def register_vector(conn: psycopg.Connection):
    """Fetch the ``vector`` type description from ``conn`` and register it.

    Registration is delegated to ``register_vector_type``, which raises
    ``ValueError`` when the type lookup returned ``None``.
    """
    register_vector_type(conn, TypeInfo.fetch(conn=conn, name="vector"))
31+
32+
33+
def register_vector_type(conn: psycopg.Connection, info: TypeInfo):
    """Install the vector dumper and loader on ``conn``.

    Args:
        conn: Connection whose adapter map is extended.
        info: Type description of the ``vector`` type, or ``None`` if the
            lookup found nothing.

    Raises:
        ValueError: If ``info`` is ``None``.
    """
    if info is None:
        raise ValueError("vector type not found")

    info.register(conn)
    vector_oid = info.oid

    # Per-connection subclass that pins the dumper to this database's OID
    # for the vector type.
    class VectorBinaryDumper(VectorDumper):
        oid = vector_oid

    registry = conn.adapters
    for python_type in (list, np.ndarray):
        registry.register_dumper(python_type, VectorBinaryDumper)
    registry.register_loader(vector_oid, VectorLoader)
45+
46+
47+
class PGVectoRS(BaseANN):
    """Benchmark adapter that stores vectors in PostgreSQL via pgvecto.rs.

    Vectors are kept in an ``items`` table and searched through an HNSW
    index created with the ``vectors`` access method.
    """

    # Metric name -> (distance operator, index operator class).
    _METRIC_SQL = {
        "angular": ("<=>", "vector_cos_ops"),
        "euclidean": ("<->", "vector_l2_ops"),
    }
    # Index-build polling: seconds between checks and maximum number of checks.
    _INDEX_POLL_SECONDS = 10
    _INDEX_POLL_LIMIT = 3600

    def __init__(self, metric, method_param) -> None:
        """Prepare the SQL statements and open the database connection.

        Args:
            metric: ``"angular"`` or ``"euclidean"``.
            method_param: Mapping providing ``"M"`` and ``"efConstruction"``.

        Raises:
            RuntimeError: If ``metric`` is not one of the supported names.
        """
        self.metric = metric
        self.m = method_param["M"]
        self.ef_construction = method_param["efConstruction"]
        self.ef_search = 100

        try:
            operator, opclass = self._METRIC_SQL[metric]
        except KeyError:
            raise RuntimeError(f"unknown metric {metric}") from None

        self.query_sql = f"SELECT id FROM items ORDER BY embedding {operator} %s LIMIT %s"
        self.index_sql = (
            f"CREATE INDEX ON items USING vectors (embedding {opclass}) "
            f"WITH (options = $$[indexing.hnsw]\nm = {self.m}\n"
            f"ef_construction = {self.ef_construction}$$)"
        )

        self.connect = psycopg.connect(user="postgres", password="password", autocommit=True)
        self.connect.execute("SET search_path = \"$user\", public, vectors")
        self.connect.execute("CREATE EXTENSION IF NOT EXISTS vectors")
        register_vector(self.connect)

    def fit(self, X):
        """Load ``X`` into a fresh ``items`` table and build the index.

        Args:
            X: Two-dimensional array; each row is stored with its row
                number as ``id``.
        """
        dim = X.shape[1]

        # The context manager closes the cursor even if loading or indexing fails.
        with self.connect.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS items")
            cur.execute(f"CREATE TABLE items (id int, embedding vector({dim}))")
            with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy:
                copy.set_types(["int4", "vector"])
                for i, emb in enumerate(X):
                    copy.write_row((i, emb))

            cur.execute(self.index_sql)
            print("waiting for indexing to finish...")
            # Poll until the stat view reports a falsy ``idx_indexing``
            # (presumably meaning the background build is done).
            for _ in range(self._INDEX_POLL_LIMIT):
                cur.execute("SELECT idx_indexing FROM vectors.pg_vector_index_stat WHERE tablename='items'")
                if not cur.fetchone()[0]:
                    break
                time.sleep(self._INDEX_POLL_SECONDS)
            else:
                # Do not fail the run, but make the exhausted budget visible
                # instead of silently querying an index that is still building.
                print("warning: indexing still in progress after the polling limit; continuing anyway")

    def set_query_arguments(self, ef_search):
        """Record ``ef_search`` and apply it to the current session."""
        self.ef_search = ef_search
        self.connect.execute(f"SET vectors.hnsw_ef_search = {ef_search}")

    def query(self, vec, num):
        """Return the ids of the ``num`` rows nearest to ``vec``, closest first."""
        cur = self.connect.execute(self.query_sql, (vec, num), binary=True, prepare=True)
        return [row[0] for row in cur.fetchall()]

    def __str__(self):
        return (
            f"PGVectoRS(metric={self.metric}, m={self.m}, "
            f"ef_construction={self.ef_construction}, ef_search={self.ef_search})"
        )

0 commit comments

Comments
 (0)