Skip to content

Commit 4d18b78

Browse files
committed
fix: update SQL queries to improve similarity calculations and indexing
1 parent b3a5dc4 commit 4d18b78

File tree

4 files changed

+29
-14
lines changed

4 files changed

+29
-14
lines changed

apps/knowledge/serializers/common.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -242,7 +242,7 @@ def create_knowledge_index(knowledge_id=None, document_id=None):
242242
if len(result) == 0:
243243
return
244244
dims = result[0]['dims']
245-
sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_l2_ops) WHERE knowledge_id = '{k_id}'"""
245+
sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_cosine_ops) WHERE knowledge_id = '{k_id}'"""
246246
update_execute(sql, [])
247247
maxkb_logger.info(f'Created index for knowledge ID: {k_id}')
248248

apps/knowledge/sql/blend_search.sql

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,15 +5,17 @@ SELECT
55
FROM
66
(
77
SELECT DISTINCT ON
8-
( "paragraph_id" ) ( similarity ),* ,
9-
similarity AS comprehensive_score
8+
( "paragraph_id" ) ( 1 - distince + ts_similarity ) as similarity, *,
9+
(1 - distince + ts_similarity) AS comprehensive_score
1010
FROM
1111
(
1212
SELECT
1313
*,
14-
(( 1 - ( embedding.embedding <=> %s ) )+ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS similarity
14+
(embedding.embedding::vector(%s) <=> %s) as distince,
15+
(ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS ts_similarity
1516
FROM
1617
embedding ${embedding_query}
18+
ORDER BY distince
1719
) TEMP
1820
ORDER BY
1921
paragraph_id,

apps/knowledge/sql/embedding_search.sql

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,12 +5,12 @@ SELECT
55
FROM
66
(
77
SELECT DISTINCT ON
8-
("paragraph_id") ( similarity ),* ,similarity AS comprehensive_score
8+
("paragraph_id") ( 1 - distince ),* ,(1 - distince) AS comprehensive_score
99
FROM
10-
( SELECT *, ( 1 - ( embedding.embedding <=> %s ) ) AS similarity FROM embedding ${embedding_query}) TEMP
10+
( SELECT *, ( embedding.embedding::vector(%s) <=> %s ) AS distince FROM embedding ${embedding_query} ORDER BY distince) TEMP
1111
ORDER BY
1212
paragraph_id,
13-
similarity DESC
13+
distince
1414
) DISTINCT_TEMP
1515
WHERE comprehensive_score>%s
1616
ORDER BY comprehensive_score DESC

apps/knowledge/vector/pg_vector.py

Lines changed: 20 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -172,8 +172,13 @@ def handle(self,
172172
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
173173
'embedding_search.sql')),
174174
with_table_name=True)
175-
embedding_model = select_list(exec_sql,
176-
[json.dumps(query_embedding), *exec_params, similarity, top_number])
175+
embedding_model = select_list(exec_sql, [
176+
len(query_embedding),
177+
json.dumps(query_embedding),
178+
*exec_params,
179+
similarity,
180+
top_number
181+
])
177182
return embedding_model
178183

179184
def support(self, search_mode: SearchMode):
@@ -193,8 +198,12 @@ def handle(self,
193198
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
194199
'keywords_search.sql')),
195200
with_table_name=True)
196-
embedding_model = select_list(exec_sql,
197-
[to_query(query_text), *exec_params, similarity, top_number])
201+
embedding_model = select_list(exec_sql, [
202+
to_query(query_text),
203+
*exec_params,
204+
similarity,
205+
top_number
206+
])
198207
return embedding_model
199208

200209
def support(self, search_mode: SearchMode):
@@ -214,9 +223,13 @@ def handle(self,
214223
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
215224
'blend_search.sql')),
216225
with_table_name=True)
217-
embedding_model = select_list(exec_sql,
218-
[json.dumps(query_embedding), to_query(query_text), *exec_params, similarity,
219-
top_number])
226+
embedding_model = select_list(exec_sql, [
227+
len(query_embedding),
228+
json.dumps(query_embedding),
229+
to_query(query_text),
230+
*exec_params, similarity,
231+
top_number
232+
])
220233
return embedding_model
221234

222235
def support(self, search_mode: SearchMode):

0 commit comments

Comments
 (0)