 diff --git a/common/common.cpp b/common/common.cpp
-index 2597ba0..e42ae73 100644
+index ec181c6..9ba699b 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -1268,3 +1268,218 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
+@@ -1345,3 +1345,222 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
      fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
      fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
  }
 +
-+gpt_params* create_gpt_params(const std::string& fname, const std::string& lora, const std::string& lora_base) {
++gpt_params* create_gpt_params(const std::string& fname, const std::string& lora, const std::string& lora_base, float lora_scale) {
 +    gpt_params* lparams = new gpt_params;
 +    fprintf(stderr, "%s: loading model %s\n", __func__, fname.c_str());
 +
 +    // Initialize the 'model' member with the 'fname' parameter
 +    lparams->model = fname;
 +    lparams->lora_base = lora_base;
-+    lparams->lora_adapter = lora;
++    if (lora_scale == 0 && !lora_base.empty()) {
++        lora_scale = 1.0f;
++    }
++    if (!lora.empty()) {
++        lparams->lora_adapter.push_back(std::make_tuple(lora, lora_scale));
++    }
 +    if (lparams->lora_adapter.empty()) {
 +        lparams->use_mmap = false;
 +    }
@@ -30,14 +35,14 @@ index 2597ba0..e42ae73 100644
 +    return lparams;
 +}
 +
-+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity) {
++void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all) {
 +    // load the model
 +    gpt_params * lparams;
 +    // Temporary workaround for https://github.com/go-skynet/go-llama.cpp/issues/218
 +#ifdef GGML_USE_CUBLAS
 +    lparams = create_gpt_params_cuda(fname);
 +#else
-+    lparams = create_gpt_params(fname, lora, lora_base);
++    lparams = create_gpt_params(fname, lora, lora_base, lora_scale);
 +#endif
 +    llama_model * model;
 +    llama_binding_state * state;
@@ -49,10 +54,8 @@ index 2597ba0..e42ae73 100644
 +    lparams->embedding = embeddings;
 +    lparams->use_mlock = mlock;
 +    lparams->n_gpu_layers = n_gpu_layers;
-+    lparams->perplexity = perplexity;
++    lparams->logits_all = logits_all;
 +    lparams->use_mmap = mmap;
-+
-+    lparams->low_vram = low_vram;
 +    if (rope_freq_base != 0.0f) {
 +        lparams->rope_freq_base = rope_freq_base;
 +    } else {
@@ -114,8 +117,9 @@ index 2597ba0..e42ae73 100644
 +                int idx) {
 +
 +    struct gpt_params params = *g_params;
++
 +    const int n_ctx = llama_n_ctx(ctx);
-+    const int n_vocab = llama_n_vocab(ctx);
++    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
 +
 +    const float temp = params.temp;
 +    const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
@@ -133,7 +137,7 @@ index 2597ba0..e42ae73 100644
 +
 +    llama_token id = 0;
 +
-+    float * logits = llama_get_logits(ctx) + idx * n_vocab;
++    float * logits = llama_get_logits_ith(ctx, idx);
 +
 +    // Apply params.logit_bias map
 +    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -184,19 +188,19 @@ index 2597ba0..e42ae73 100644
 +        if (mirostat == 1) {
 +            static float mirostat_mu = 2.0f * mirostat_tau;
 +            const int mirostat_m = 100;
-+            llama_sample_temperature(ctx, &cur_p, temp);
++            llama_sample_temp(ctx, &cur_p, temp);
 +            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
 +        } else if (mirostat == 2) {
 +            static float mirostat_mu = 2.0f * mirostat_tau;
-+            llama_sample_temperature(ctx, &cur_p, temp);
++            llama_sample_temp(ctx, &cur_p, temp);
 +            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
 +        } else {
 +            // Temperature sampling
 +            llama_sample_top_k      (ctx, &cur_p, top_k, 1);
 +            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
 +            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
 +            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
-+            llama_sample_temperature(ctx, &cur_p, temp);
++            llama_sample_temp       (ctx, &cur_p, temp);
 +
 +            {
 +                const int n_top = 10;
@@ -223,10 +227,10 @@ index 2597ba0..e42ae73 100644
 +}
 \ No newline at end of file
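The hunks above track a llama.cpp API bump: lora_adapter is now a list of (path, scale) tuples rather than a single string, the low_vram and perplexity knobs give way to logits_all, per-index logits come from llama_get_logits_ith instead of manual pointer arithmetic, and llama_sample_temperature is renamed llama_sample_temp. A minimal sketch of a call against the new load_binding_model signature follows; the model path and every flag value are illustrative assumptions, not values taken from the patch:

// Hedged sketch, not part of the patch: invoking the updated entry point.
// "./model.gguf" and all flag values below are illustrative assumptions.
void * state = load_binding_model(
    "./model.gguf", // fname
    2048,           // n_ctx
    -1,             // n_seed
    true,           // memory_f16
    false,          // mlock
    false,          // embeddings
    true,           // mmap
    0,              // n_gpu_layers
    512,            // n_batch
    "",             // maingpu
    "",             // tensorsplit
    false,          // numa
    0.0f,           // rope_freq_base (0 keeps the model default)
    0.0f,           // rope_freq_scale (0 keeps the model default)
    false,          // mul_mat_q
    "",             // lora (empty: no adapter is pushed)
    "",             // lora_base
    0.0f,           // lora_scale (promoted to 1.0f only when lora_base is set)
    false);         // logits_all (replaces the old perplexity flag)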
 diff --git a/common/common.h b/common/common.h
-index 18aea38..ca7a168 100644
+index 0e2d3fa..9992d2b 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -209,3 +209,19 @@ std::string get_sortable_timestamp();
+@@ -221,3 +221,19 @@ std::string get_sortable_timestamp();
  void dump_non_result_info_yaml(
      FILE * stream, const gpt_params & params, const llama_context * lctx,
      const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
@@ -236,7 +240,7 @@ index 18aea38..ca7a168 100644
 +    llama_model * model;
 +};
 +
-+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity);
++void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all);
 +
 +llama_token llama_sample_token_binding(
 +        struct llama_context * ctx,
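For orientation, here is a condensed sketch of the sampling path that llama_sample_token_binding follows under the renamed API. It is paraphrased from the patch rather than verbatim, and the top_k/temp parameters stand in for the corresponding gpt_params fields:

// Condensed sketch of the renamed llama.cpp API in use; paraphrased, not patch code.
#include <vector>
#include "llama.h"

llama_token sample_sketch(llama_context * ctx, int idx, int32_t top_k, float temp) {
    const int n_vocab = llama_n_vocab(llama_get_model(ctx)); // now takes the model, not the context
    float * logits = llama_get_logits_ith(ctx, idx);         // replaces logits + idx * n_vocab

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.push_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };

    llama_sample_top_k(ctx, &cur_p, top_k, 1);
    llama_sample_temp (ctx, &cur_p, temp);   // was llama_sample_temperature
    return llama_sample_token(ctx, &cur_p);
}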