Commit 78803c4

Add count_tokens.ps1 script
1 parent 97d79b0 commit 78803c4

3 files changed: 117 additions & 2 deletions

README.md

Lines changed: 28 additions & 1 deletion
@@ -110,7 +110,6 @@ Execute the following to get detailed help on further options of the server script:

Get-Help -Detailed .\examples\server.ps1
```

### Chat via CLI

You can now chat with the model:
@@ -203,6 +202,34 @@ Execute the following to measure the perplexity of the GGML formatted model:

    --file "./vendor/wikitext-2-raw-v1/wikitext-2-raw/wiki.test.raw"
```

### Count prompt tokens

You can easily count the tokens of a given text input for a specific model by using the [.\examples\count_tokens.ps1](./examples/count_tokens.ps1) script:

```PowerShell
.\examples\count_tokens.ps1 `
    -model ".\vendor\llama.cpp\models\openchat-3.5-0106.Q5_K_M.gguf" `
    -file ".\prompts\chat_with_llm.txt"
```
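
Because the script prints the count with `Write-Host`, the number lands on the information stream rather than the pipeline. A minimal sketch (not part of this commit) of capturing it into a variable on PowerShell 5+, where stream 6 carries `Write-Host` output:

```PowerShell
# Sketch: redirect the information stream (6) into the success stream
# so the token count printed by Write-Host can be assigned:
$count = .\examples\count_tokens.ps1 `
    -model ".\vendor\llama.cpp\models\openchat-3.5-0106.Q5_K_M.gguf" `
    -prompt "Hello World!" 6>&1

Write-Output "The prompt uses $count tokens."
```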

To inspect the actual tokenization result, you can use the `-debug` flag:

```PowerShell
.\examples\count_tokens.ps1 `
    -model ".\vendor\llama.cpp\models\openchat-3.5-0106.Q5_K_M.gguf" `
    -prompt "Hello World!" `
    -debug
```

> [!NOTE]
> The script is a simple wrapper for the [tokenize.cpp](https://github.com/ggerganov/llama.cpp/blob/master/examples/tokenize/tokenize.cpp) example of the llama.cpp project.
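
For orientation, the call the wrapper assembles looks roughly like this (a sketch; the `tokenize.exe` build path and flags are the ones used inside the script, and the model path matches the examples above):

```PowerShell
# Roughly the underlying invocation, assuming the default build location:
.\vendor\llama.cpp\build\bin\Release\tokenize.exe `
    --log-disable `
    --ids `
    --model ".\vendor\llama.cpp\models\openchat-3.5-0106.Q5_K_M.gguf" `
    --prompt "Hello World!"
```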

Execute the following to get detailed help on further options of the script:

```PowerShell
Get-Help -Detailed .\examples\count_tokens.ps1
```

## Build

### Rebuild llama.cpp

examples/count_tokens.ps1

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
#Requires -Version 5.0

<#
.SYNOPSIS
Counts the tokens of a prompt for a specific model.

.DESCRIPTION
This script counts the tokens of a prompt file or a prompt string for a specific model.

.PARAMETER model
Specifies the path to the GGUF model file.

.PARAMETER file
Specifies the path to the prompt text file.

.PARAMETER prompt
Specifies the prompt.

.PARAMETER debug
Logs the result of the tokenization.

.EXAMPLE
.\count_tokens.ps1 -model "C:\models\openchat-3.5-0106.Q5_K_M.gguf" -file "C:\prompts\chat_with_llm.txt"

.EXAMPLE
.\count_tokens.ps1 -model "C:\models\openchat-3.5-0106.Q5_K_M.gguf" -prompt "Hello world!"

.EXAMPLE
.\count_tokens.ps1 -model "C:\models\openchat-3.5-0106.Q5_K_M.gguf" -prompt "Hello world!" -debug
#>

Param (

    [Parameter(
        HelpMessage="The path to the GGUF model file.",
        Mandatory=$true
    )]
    [String]
    $model,

    [Parameter(
        HelpMessage="The path to the prompt text file."
    )]
    [String]
    $file,

    [Parameter(
        HelpMessage="The prompt input."
    )]
    [String]
    $prompt
)

if ((!$file -and !$prompt) -or ($file -and $prompt)) {
    throw "Exactly one prompt input is required: specify either the -file or the -prompt parameter."
}

$debug = $PSCmdlet.MyInvocation.BoundParameters["Debug"].IsPresent -eq $true
$verbose = $PSCmdlet.MyInvocation.BoundParameters["Verbose"].IsPresent -eq $true

# We are resolving the absolute path to the llama.cpp project directory.
$llamaCppPath = Resolve-Path -Path "${PSScriptRoot}\..\vendor\llama.cpp"

$modelPath = Resolve-Path -Path "${model}"

if ($file) {
    $filePath = Resolve-Path -Path "${file}"
}

if ($debug) {

    # For debugging purposes we are logging the default output of the tokenization.
    Invoke-Expression "${llamaCppPath}\build\bin\Release\tokenize.exe ``
        $(if ($modelPath) {"--model '${modelPath}'"}) ``
        $(if ($filePath) {"--file '${filePath}'"} else {"--prompt '${prompt}'"})"
}

# We are only interested in the numerical token IDs in an array format like [1, 2, 3].
$tokensPythonArrayString = Invoke-Expression "${llamaCppPath}\build\bin\Release\tokenize.exe ``
    --log-disable ``
    --ids ``
    $(if ($modelPath) {"--model '${modelPath}'"}) ``
    $(if ($filePath) {"--file '${filePath}'"} else {"--prompt '${prompt}'"})"

# We are converting the Python-style array string into a PowerShell array.
$tokens = "${tokensPythonArrayString}".Trim('[', ']') -split ',' | ForEach-Object { [int]$_ }

Write-Host $tokens.Length
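
As a quick standalone illustration (not part of the commit) of the array-string conversion the script performs at the end, using a made-up ID string:

```PowerShell
# A Python-style list literal in the shape that tokenize.exe --ids prints:
$raw = "[1, 15043, 2787]"

# Trim the brackets, split on commas, and cast each piece to [int]
# (the cast tolerates the surrounding whitespace):
$ids = $raw.Trim('[', ']') -split ',' | ForEach-Object { [int]$_ }

$ids.Length  # 3
```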

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule commit pointer updated)
