Skip to content

Commit f70f189

Browse files
committed
feat: add dataset index creation and deletion functions
1 parent ceea85e commit f70f189

File tree

7 files changed

+78
-17
lines changed

7 files changed

+78
-17
lines changed

apps/common/event/listener_manage.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from common.util.lock import try_lock, un_lock
2525
from common.util.page_utils import page_desc
2626
from dataset.models import Paragraph, Status, Document, ProblemParagraphMapping, TaskType, State
27+
from dataset.serializers.common_serializers import create_dataset_index
2728
from embedding.models import SourceType, SearchMode
2829
from smartdoc.conf import PROJECT_DIR
2930
from django.utils.translation import gettext_lazy as _
@@ -281,6 +282,8 @@ def is_the_task_interrupted():
281282
ListenerManagement.get_aggregation_document_status(
282283
document_id)),
283284
is_the_task_interrupted)
285+
# Ensure the dataset's vector index exists (create it if missing)
286+
create_dataset_index(document_id=document_id)
284287
except Exception as e:
285288
max_kb_error.error(_('Vectorized document: {document_id} error {error} {traceback}').format(
286289
document_id=document_id, error=str(e), traceback=traceback.format_exc()))

apps/dataset/serializers/common_serializers.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818

1919
from common.config.embedding_config import ModelManage
2020
from common.db.search import native_search
21-
from common.db.sql_execute import update_execute
21+
from common.db.sql_execute import update_execute, sql_execute
2222
from common.exception.app_exception import AppApiException
2323
from common.mixins.api_mixin import ApiMixin
2424
from common.util.field_message import ErrMessage
2525
from common.util.file_util import get_file_content
2626
from common.util.fork import Fork
27-
from dataset.models import Paragraph, Problem, ProblemParagraphMapping, DataSet, File, Image
27+
from dataset.models import Paragraph, Problem, ProblemParagraphMapping, DataSet, File, Image, Document
2828
from setting.models_provider import get_model
2929
from smartdoc.conf import PROJECT_DIR
3030
from django.utils.translation import gettext_lazy as _
@@ -224,6 +224,46 @@ def get_embedding_model_id_by_dataset_id_list(dataset_id_list: List):
224224
return str(dataset_list[0].embedding_mode_id)
225225

226226

227+
228+
def create_dataset_index(dataset_id=None, document_id=None):
    """Create the per-dataset pgvector HNSW index on ``embedding`` if it is missing.

    Exactly one of the two ids must be supplied; when only ``document_id`` is
    given, the owning dataset is resolved from the Document row.

    :param dataset_id: id of the dataset whose embeddings should be indexed.
    :param document_id: id of a document belonging to the target dataset.
    :raises AppApiException: when neither id is provided.
    """
    if dataset_id is None and document_id is None:
        raise AppApiException(500, _('Dataset ID or Document ID must be provided'))

    if dataset_id is not None:
        k_id = dataset_id
    else:
        document = QuerySet(Document).filter(id=document_id).first()
        if document is None:
            # Unknown document: nothing to index (best effort, mirrors the
            # caller's fire-and-forget usage after vectorization).
            return
        k_id = document.dataset_id

    index_name = f'embedding_hnsw_idx_{k_id}'
    # Parameterized lookup: has the partial index for this dataset already been built?
    index = sql_execute(
        "SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'embedding' AND indexname = %s",
        [index_name])
    if not index:
        # Probe a single row to learn the embedding dimensionality of this dataset.
        result = sql_execute(
            "SELECT vector_dims(embedding) AS dims FROM embedding WHERE dataset_id = %s LIMIT 1",
            [str(k_id)])
        if len(result) == 0:
            # No embeddings stored yet -> nothing to index.
            return
        dims = result[0]['dims']
        # DDL statements cannot take bind parameters; k_id is a UUID primary key
        # and dims comes from vector_dims(), so interpolating them here is safe.
        sql = (f'CREATE INDEX "{index_name}" ON embedding USING hnsw '
               f'((embedding::vector({dims})) vector_cosine_ops) '
               f"WHERE dataset_id = '{k_id}'")
        update_execute(sql, [])
248+
249+
250+
def drop_dataset_index(dataset_id=None, document_id=None):
    """Drop the per-dataset pgvector HNSW index on ``embedding``, if present.

    Exactly one of the two ids must be supplied; when only ``document_id`` is
    given, the owning dataset is resolved from the Document row.

    :param dataset_id: id of the dataset whose index should be dropped.
    :param document_id: id of a document belonging to the target dataset.
    :raises AppApiException: when neither id is provided.
    """
    if dataset_id is None and document_id is None:
        raise AppApiException(500, _('Dataset ID or Document ID must be provided'))

    if dataset_id is not None:
        k_id = dataset_id
    else:
        document = QuerySet(Document).filter(id=document_id).first()
        if document is None:
            # Unknown document: no dataset to resolve, so there is no index to drop.
            return
        k_id = document.dataset_id

    index_name = f'embedding_hnsw_idx_{k_id}'
    # Parameterized existence check against the catalog.
    index = sql_execute(
        "SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'embedding' AND indexname = %s",
        [index_name])
    if index:
        # IF EXISTS guards the race between the catalog check above and the drop.
        # Identifier interpolation is safe: k_id is a UUID primary key.
        update_execute(f'DROP INDEX IF EXISTS "{index_name}"', [])
265+
266+
227267
class GenerateRelatedSerializer(ApiMixin, serializers.Serializer):
228268
model_id = serializers.UUIDField(required=True, error_messages=ErrMessage.uuid(_('Model id')))
229269
prompt = serializers.CharField(required=True, error_messages=ErrMessage.uuid(_('Prompt word')))

apps/dataset/serializers/dataset_serializers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
State, File, Image
4545
from dataset.serializers.common_serializers import list_paragraph, MetaSerializer, ProblemParagraphManage, \
4646
get_embedding_model_by_dataset_id, get_embedding_model_id_by_dataset_id, write_image, zip_dir, \
47-
GenerateRelatedSerializer
47+
GenerateRelatedSerializer, drop_dataset_index
4848
from dataset.serializers.document_serializers import DocumentSerializers, DocumentInstanceSerializer
4949
from dataset.task import sync_web_dataset, sync_replace_web_dataset, generate_related_by_dataset_id
5050
from embedding.models import SearchMode
@@ -788,6 +788,7 @@ def delete(self):
788788
QuerySet(ProblemParagraphMapping).filter(dataset=dataset).delete()
789789
QuerySet(Paragraph).filter(dataset=dataset).delete()
790790
QuerySet(Problem).filter(dataset=dataset).delete()
791+
drop_dataset_index(dataset_id=dataset.id)
791792
dataset.delete()
792793
delete_embedding_by_dataset(self.data.get('id'))
793794
return True

apps/embedding/sql/blend_search.sql

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@ SELECT
55
FROM
66
(
77
SELECT DISTINCT ON
8-
( "paragraph_id" ) ( similarity ),* ,
9-
similarity AS comprehensive_score
8+
( "paragraph_id" ) ( 1 - distance + ts_similarity ) AS similarity, *,
9+
(1 - distance + ts_similarity) AS comprehensive_score
1010
FROM
1111
(
1212
SELECT
1313
*,
14-
(( 1 - ( embedding.embedding <=> %s ) )+ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS similarity
14+
(embedding.embedding::vector(%s) <=> %s) AS distance,
15+
(ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS ts_similarity
1516
FROM
1617
embedding ${embedding_query}
18+
ORDER BY distance
1719
) TEMP
1820
ORDER BY
1921
paragraph_id,

apps/embedding/sql/embedding_search.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@ SELECT
55
FROM
66
(
77
SELECT DISTINCT ON
8-
("paragraph_id") ( similarity ),* ,similarity AS comprehensive_score
8+
("paragraph_id") ( 1 - distance ),* ,(1 - distance) AS comprehensive_score
99
FROM
10-
( SELECT *, ( 1 - ( embedding.embedding <=> %s ) ) AS similarity FROM embedding ${embedding_query}) TEMP
10+
( SELECT *, ( embedding.embedding::vector(%s) <=> %s ) AS distance FROM embedding ${embedding_query} ORDER BY distance) TEMP
1111
ORDER BY
1212
paragraph_id,
13-
similarity DESC
13+
distance
1414
) DISTINCT_TEMP
1515
WHERE comprehensive_score>%s
1616
ORDER BY comprehensive_score DESC

apps/embedding/task/embedding.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from common.event import ListenerManagement, UpdateProblemArgs, UpdateEmbeddingDatasetIdArgs, \
1818
UpdateEmbeddingDocumentIdArgs
1919
from dataset.models import Document, TaskType, State
20+
from dataset.serializers.common_serializers import drop_dataset_index
2021
from ops import celery_app
2122
from setting.models import Model
2223
from setting.models_provider import get_model
@@ -110,6 +111,7 @@ def embedding_by_dataset(dataset_id, model_id):
110111
max_kb.info(_('Start--->Vectorized dataset: {dataset_id}').format(dataset_id=dataset_id))
111112
try:
112113
ListenerManagement.delete_embedding_by_dataset(dataset_id)
114+
drop_dataset_index(dataset_id=dataset_id)
113115
document_list = QuerySet(Document).filter(dataset_id=dataset_id)
114116
max_kb.info(_('Dataset documentation: {document_names}').format(
115117
document_names=", ".join([d.name for d in document_list])))

apps/embedding/vector/pg_vector.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from abc import ABC, abstractmethod
1313
from typing import Dict, List
1414

15-
import jieba
1615
from django.contrib.postgres.search import SearchVector
1716
from django.db.models import QuerySet, Value
1817
from langchain_core.embeddings import Embeddings
@@ -169,8 +168,13 @@ def handle(self,
169168
os.path.join(PROJECT_DIR, "apps", "embedding", 'sql',
170169
'embedding_search.sql')),
171170
with_table_name=True)
172-
embedding_model = select_list(exec_sql,
173-
[json.dumps(query_embedding), *exec_params, similarity, top_number])
171+
embedding_model = select_list(exec_sql, [
172+
len(query_embedding),
173+
json.dumps(query_embedding),
174+
*exec_params,
175+
similarity,
176+
top_number
177+
])
174178
return embedding_model
175179

176180
def support(self, search_mode: SearchMode):
@@ -190,8 +194,12 @@ def handle(self,
190194
os.path.join(PROJECT_DIR, "apps", "embedding", 'sql',
191195
'keywords_search.sql')),
192196
with_table_name=True)
193-
embedding_model = select_list(exec_sql,
194-
[to_query(query_text), *exec_params, similarity, top_number])
197+
embedding_model = select_list(exec_sql, [
198+
to_query(query_text),
199+
*exec_params,
200+
similarity,
201+
top_number
202+
])
195203
return embedding_model
196204

197205
def support(self, search_mode: SearchMode):
@@ -211,9 +219,14 @@ def handle(self,
211219
os.path.join(PROJECT_DIR, "apps", "embedding", 'sql',
212220
'blend_search.sql')),
213221
with_table_name=True)
214-
embedding_model = select_list(exec_sql,
215-
[json.dumps(query_embedding), to_query(query_text), *exec_params, similarity,
216-
top_number])
222+
embedding_model = select_list(exec_sql, [
223+
len(query_embedding),
224+
json.dumps(query_embedding),
225+
to_query(query_text),
226+
*exec_params,
227+
similarity,
228+
top_number
229+
])
217230
return embedding_model
218231

219232
def support(self, search_mode: SearchMode):

0 commit comments

Comments
 (0)