Skip to content

Commit 7437f29

Browse files
author
Rahulkumarsharma01
committed
Fix: Shorten embedding filenames using model_id_to_filename (#122)
1 parent fd7ae2a commit 7437f29

File tree

1 file changed

+31
-23
lines changed

1 file changed

+31
-23
lines changed

cookbook/populate_embeddings.ipynb

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,12 @@
2424
},
2525
{
2626
"cell_type": "code",
27-
"execution_count": 1,
27+
"execution_count": 25,
2828
"id": "c5498911",
2929
"metadata": {
3030
"id": "c5498911"
3131
},
32-
"outputs": [
33-
{
34-
"name": "stderr",
35-
"output_type": "stream",
36-
"text": [
37-
"2025-05-27 13:21:11.840076: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
38-
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
39-
]
40-
}
41-
],
32+
"outputs": [],
4233
"source": [
4334
"import os\n",
4435
"import os.path\n",
@@ -50,7 +41,10 @@
5041
"import math\n",
5142
"# import numpy as np\n",
5243
"import pandas as pd\n",
53-
"from sentence_transformers import SentenceTransformer"
44+
"from sentence_transformers import SentenceTransformer\n",
45+
"\n",
46+
"def model_id_to_filename(model_id):\n",
47+
" return model_id.split(\"/\")[-1].lower()"
5448
]
5549
},
5650
{
@@ -65,7 +59,7 @@
6559
},
6660
{
6761
"cell_type": "code",
68-
"execution_count": 2,
62+
"execution_count": 26,
6963
"id": "45b95c55",
7064
"metadata": {
7165
"id": "45b95c55"
@@ -85,7 +79,7 @@
8579
},
8680
{
8781
"cell_type": "code",
88-
"execution_count": 3,
82+
"execution_count": 27,
8983
"id": "b87a3c65-0e08-4fa9-aa8f-2f9a2f6c3499",
9084
"metadata": {
9185
"colab": {
@@ -101,7 +95,7 @@
10195
"False"
10296
]
10397
},
104-
"execution_count": 3,
98+
"execution_count": 27,
10599
"metadata": {},
106100
"output_type": "execute_result"
107101
}
@@ -122,7 +116,7 @@
122116
},
123117
{
124118
"cell_type": "code",
125-
"execution_count": 4,
119+
"execution_count": 28,
126120
"id": "95fb523c",
127121
"metadata": {
128122
"id": "95fb523c"
@@ -150,7 +144,7 @@
150144
},
151145
{
152146
"cell_type": "code",
153-
"execution_count": 5,
147+
"execution_count": 29,
154148
"id": "cd09f66b",
155149
"metadata": {
156150
"id": "cd09f66b"
@@ -220,7 +214,7 @@
220214
},
221215
{
222216
"cell_type": "code",
223-
"execution_count": 6,
217+
"execution_count": 30,
224218
"id": "87316fa4-1fcf-41c4-9913-bc5704b25ea2",
225219
"metadata": {
226220
"colab": {
@@ -248,19 +242,33 @@
248242
"\n",
249243
"Opening existing file locally: ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json\n",
250244
"Request url: https://router.huggingface.co/hf-inference/models/BAAI/bge-large-en-v1.5/pipeline/feature-extraction\n",
251-
"Dimensions from hugging face API response: 1024\n",
245+
"Dimensions from hugging face API response: 1\n",
252246
"Dimensions from json file: 1024\n",
253247
"Old prompts: 2217\n",
254248
"New prompts: 0\n",
255249
"Errors: 0\n",
256250
"Successes: 0\n",
257-
"Updating centroids.\n",
251+
"Updating centroids.\n"
252+
]
253+
},
254+
{
255+
"name": "stderr",
256+
"output_type": "stream",
257+
"text": [
258+
"C:\\Users\\Rahul\\AppData\\Local\\Temp\\ipykernel_17512\\3081262251.py:43: UserWarning: Dimensions are different: API=1 while JSON sentences file=1024\n",
259+
" warnings.warn( f\"Dimensions are different: API={api_response_dimensions} while JSON sentences file={json_file_dimensions}\" )\n"
260+
]
261+
},
262+
{
263+
"name": "stdout",
264+
"output_type": "stream",
265+
"text": [
258266
"Saving into file: ../prompt-sentences-main/prompt_sentences-bge-large-en-v1.5.json\n",
259267
"\n",
260268
"\n",
261269
"Opening existing file locally: ../prompt-sentences-main/prompt_sentences-multilingual-e5-large.json\n",
262270
"Request url: https://router.huggingface.co/hf-inference/models/intfloat/multilingual-e5-large/pipeline/feature-extraction\n",
263-
"Dimensions from hugging face API response: 1024\n",
271+
"Dimensions from hugging face API response: 1\n",
264272
"Dimensions from json file: 1024\n",
265273
"Old prompts: 2217\n",
266274
"New prompts: 0\n",
@@ -458,7 +466,7 @@
458466
"provenance": []
459467
},
460468
"kernelspec": {
461-
"display_name": "Python 3 (ipykernel)",
469+
"display_name": "Python 3",
462470
"language": "python",
463471
"name": "python3"
464472
},
@@ -472,7 +480,7 @@
472480
"name": "python",
473481
"nbconvert_exporter": "python",
474482
"pygments_lexer": "ipython3",
475-
"version": "3.9.6"
483+
"version": "3.13.2"
476484
}
477485
},
478486
"nbformat": 4,

0 commit comments

Comments
 (0)