@@ -144,7 +144,7 @@ class llm_graph_input_pos_bucket : public llm_graph_input_i {
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };
 
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +158,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +177,8 @@ class llm_graph_input_out_ids : public llm_graph_input_i {
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const uint32_t n_outputs;
 };
@@ -192,7 +192,7 @@ class llm_graph_input_mean : public llm_graph_input_i {
 
     ggml_tensor * mean; // F32 [n_batch, n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +204,7 @@ class llm_graph_input_cls : public llm_graph_input_i {
 
     ggml_tensor * cls; // I32 [n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -247,8 +247,8 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i {
     ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +278,11 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
     ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    //       need to carry these parameters with them. otherwise, they can point to freed
+    //       llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -318,8 +321,8 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
     ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_iswa_context * mctx;
 };
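The note added in the `llm_graph_input_attn_kv_unified` hunk explains the whole patch: a graph input that stores `hparams`/`cparams` by reference stays tied to the `llm_graph_params` of the batch it was built from, so reusing the graph on a later batch reads freed stack memory. The following is a minimal standalone sketch of that lifetime bug, not code from the patch; `params_t`, `input_ref_t`, `input_copy_t`, and `make_input` are hypothetical stand-ins for `llm_graph_params` and the input classes above.

```cpp
#include <cstdio>

// hypothetical stand-in for batch-local llm_graph_params
struct params_t {
    int n_batch;
};

// before the change (buggy pattern): validity of the input depends on
// whoever owns the referenced params
struct input_ref_t {
    const params_t & p;
};

// after the change: the input carries its own copy, so it remains valid
// after the params it was built from are destroyed -- the property that
// graph reuse requires
struct input_copy_t {
    const params_t p;
};

// mimics building a graph input from params that live only for one batch
static input_copy_t make_input(int n_batch) {
    params_t batch_params = { n_batch };  // lives on this stack frame only
    return input_copy_t{ batch_params };  // the copy travels with the input

    // returning input_ref_t{ batch_params } here would leave the input
    // pointing into this frame after it unwinds: the stack-use-after-return
    // described in the note
}

int main() {
    input_copy_t input = make_input(512);
    std::printf("n_batch = %d\n", input.p.n_batch); // safe: reads the copy
    return 0;
}
```

Keeping the members `const` preserves the read-only intent of the old references while the by-value storage decouples each input's lifetime from the batch that created it.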