Skip to content

Commit 45721cb

Browse files
authored
chore: cleanup (#44)
1 parent efd14ed commit 45721cb

File tree

2 files changed

+20
-28
lines changed

2 files changed

+20
-28
lines changed

lib/embedding-lib.js

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
import fs from 'fs'
1+
import fs from 'fs/promises'
2+
import { constants } from 'fs'
23
import path from 'path'
34
import { fileURLToPath } from 'url'
45
import * as ort from 'onnxruntime-web'
@@ -10,18 +11,14 @@ const MODEL_DIR = path.resolve(__dirname, '..', 'models')
1011

1112
const FILES = ['onnx/model.onnx', 'tokenizer.json', 'tokenizer_config.json']
1213

13-
function saveFile(buffer, outputPath) {
14-
return new Promise((resolve, reject) => {
15-
fs.writeFile(outputPath, Buffer.from(buffer), err => {
16-
if (err) reject(err)
17-
else resolve()
18-
})
19-
})
14+
async function saveFile(buffer, outputPath) {
15+
await fs.writeFile(outputPath, Buffer.from(buffer))
2016
}
2117

22-
function fileExists(filePath) {
18+
async function fileExists(filePath) {
2319
try {
24-
return fs.existsSync(filePath)
20+
await fs.access(filePath, constants.F_OK)
21+
return true
2522
} catch {
2623
return false
2724
}
@@ -44,13 +41,15 @@ async function downloadFile(url, outputPath) {
4441
}
4542

4643
async function downloadModelIfNeeded() {
47-
if (!fs.existsSync(MODEL_DIR)) {
48-
fs.mkdirSync(MODEL_DIR, { recursive: true })
44+
try {
45+
await fs.access(MODEL_DIR)
46+
} catch {
47+
await fs.mkdir(MODEL_DIR, { recursive: true })
4948
}
5049

5150
for (const file of FILES) {
5251
const filePath = path.join(MODEL_DIR, path.basename(file))
53-
if (!fileExists(filePath)) {
52+
if (!(await fileExists(filePath))) {
5453
const url = `https://huggingface.co/${MODEL_NAME}/resolve/main/${file}`
5554
await downloadFile(url, filePath)
5655
}
@@ -65,12 +64,8 @@ async function forceRedownloadModel() {
6564
// Delete all model files to force re-download
6665
for (const file of FILES) {
6766
const filePath = path.join(MODEL_DIR, path.basename(file))
68-
try {
69-
if (fileExists(filePath)) {
70-
fs.unlinkSync(filePath)
71-
}
72-
} catch {
73-
// Ignore deletion errors, we'll overwrite anyway
67+
if (await fileExists(filePath)) {
68+
await fs.unlink(filePath).catch(() => {})
7469
}
7570
}
7671

@@ -84,11 +79,11 @@ async function initializeModelAndVocab() {
8479

8580
const loadModelAndVocab = async () => {
8681
// Load model as buffer for onnxruntime-web
87-
const modelBuffer = fs.readFileSync(modelPath)
82+
const modelBuffer = await fs.readFile(modelPath)
8883
session = await ort.InferenceSession.create(modelBuffer)
8984

9085
// Try to parse tokenizer JSON
91-
const tokenizerJson = JSON.parse(fs.readFileSync(vocabPath, 'utf-8'))
86+
const tokenizerJson = JSON.parse(await fs.readFile(vocabPath, 'utf-8'))
9287

9388
// Validate tokenizer structure
9489
if (!tokenizerJson.model || !tokenizerJson.model.vocab) {
@@ -112,7 +107,6 @@ async function initializeModelAndVocab() {
112107
} catch (error) {
113108
// Model or tokenizer is corrupted, force re-download
114109
// eslint-disable-next-line no-console
115-
console.warn('Model corruption detected, re-downloading...', error.message)
116110
await forceRedownloadModel()
117111

118112
// Retry initialization after re-download
@@ -347,15 +341,13 @@ async function processChunkedEmbeddings(chunks, session) {
347341
const validIds = ids.filter(id => {
348342
const isValid = typeof id === 'number' && !isNaN(id) && isFinite(id)
349343
if (!isValid) {
350-
// eslint-disable-next-line no-console
351-
console.warn(`Invalid token ID detected: ${id} (type: ${typeof id})`)
344+
throw new Error(`Invalid token ID detected: ${id} (type: ${typeof id})`)
352345
}
353346
return isValid
354347
})
355348

356349
if (validIds.length !== ids.length) {
357-
// eslint-disable-next-line no-console
358-
console.warn(`Filtered out ${ids.length - validIds.length} invalid token IDs`)
350+
throw new Error(`Found ${ids.length - validIds.length} invalid token IDs`)
359351
}
360352

361353
const inputIds = new BigInt64Array(validIds.map(i => BigInt(i)))

scripts/createEmbeddings.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@ async function main() {
2828
// can also run independently, stored/read on file system
2929
// await createSnippets();
3030
await createSnippetsEmbeddings()
31-
// afterwards, copy ./docs/* to https://github.tools.sap/cap/docs-resources -> public/embeddings/
31+
// afterwards, copy to capire @ public/embeddings/
3232
}
3333

3434
async function createSnippetsEmbeddings() {
35-
const chunks = JSON.parse(await fs.readFile('/Users/d065023/SAPDevelop/chunking/code-snippets.json'))
35+
const chunks = JSON.parse(await fs.readFile('<ADAPT!>/code-snippets.json'))
3636
await createEmbeddings(
3737
'code-chunks',
3838
chunks.map(c => chunkToText(c))

0 commit comments

Comments
 (0)