From 5bc774d1f8e054d9590be3edafda1a18443690be Mon Sep 17 00:00:00 2001 From: sigridjineth Date: Sat, 6 Sep 2025 14:36:14 +0900 Subject: [PATCH 1/3] feat: colbert_vecs --- BGEM3TFModel.py | 216 ++++++++++++++++++---------------- BGEM3WeightConverter.py | 91 ++++++++++++-- model_conversion_validator.py | 80 ++++++++++--- tf_colbert_loader.py | 101 ++++++++++++++++ torch_tf_validator.py | 67 ++++++++--- 5 files changed, 412 insertions(+), 143 deletions(-) create mode 100644 tf_colbert_loader.py diff --git a/BGEM3TFModel.py b/BGEM3TFModel.py index 14e8fc9..48eef84 100644 --- a/BGEM3TFModel.py +++ b/BGEM3TFModel.py @@ -18,25 +18,27 @@ def __init__(self, d_model, num_heads, dropout_rate=0.1, **kwargs): self.d_model = d_model self.depth = d_model // num_heads # 각 헤드의 차원 크기 - # Query, Key, Value를 위한 Dense Layer - self.wq = tf.keras.layers.Dense(d_model) - self.wk = tf.keras.layers.Dense(d_model) - self.wv = tf.keras.layers.Dense(d_model) + # Query, Key, Value를 위한 Dense Layer (stable names for SavedModel) + self.wq = tf.keras.layers.Dense(d_model, name="attention_wq") + self.wk = tf.keras.layers.Dense(d_model, name="attention_wk") + self.wv = tf.keras.layers.Dense(d_model, name="attention_wv") # 출력 레이어 - self.dense = tf.keras.layers.Dense(d_model) + self.dense = tf.keras.layers.Dense(d_model, name="attention_output") # 어텐션 layerNorm - self.attlayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-5) + self.attlayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="attn_LayerNorm") # 드롭아웃 self.dropout = tf.keras.layers.Dropout(dropout_rate) - def stable_softmax(self, logits, axis=None, name=None): - """ - Stable softmax implementation - """ - return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name) + def stable_softmax(self, logits, axis=-1, name=None): + """Numerically stable softmax: subtract max and compute in float32.""" + dtype = logits.dtype + x = tf.cast(logits, tf.float32) + x = x - tf.reduce_max(x, axis=axis, keepdims=True) + probs = tf.nn.softmax(x, axis=axis, name=name) + return tf.cast(probs, dtype) def split_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) @@ -45,27 +47,29 @@ def split_heads(self, x, batch_size): def call(self, inputs, mask=None, training=False): batch_size = tf.shape(inputs)[0] - # Query, Key, Value를 계산 - q = self.wq(inputs) # (batch_size, seq_len, d_model) - k = self.wk(inputs) # (batch_size, seq_len, d_model) - v = self.wv(inputs) # (batch_size, seq_len, d_model) + # Projections + q = self.wq(inputs) + k = self.wk(inputs) + v = self.wv(inputs) - # 다중 헤드로 분리 - q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth) - k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth) - v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth) + # Split heads + q = self.split_heads(q, batch_size) + k = self.split_heads(k, batch_size) + v = self.split_heads(v, batch_size) - # Scaled Dot-Product Attention - sqrt_att_head_size = math.sqrt(self.depth) - - attention_scores = tf.matmul(q, k, transpose_b=True) # (batch_size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(sqrt_att_head_size, tf.float32) - attention_scores = tf.divide(attention_scores, dk) + # Scaled dot-product attention (compute in float32 for stability) + q_f = tf.cast(q, tf.float32) + k_f = tf.cast(k, tf.float32) + attention_scores = tf.matmul(q_f, k_f, transpose_b=True) + scale = tf.sqrt(tf.cast(self.depth, tf.float32)) + attention_scores = attention_scores / scale if mask is not 
None: - attention_scores = tf.add(attention_scores, mask) + attention_scores = attention_scores + tf.cast(mask, tf.float32) attention_probs = self.stable_softmax(attention_scores, axis=-1) + # Cast back to v dtype for matmul efficiency under mixed precision + attention_probs = tf.cast(attention_probs, v.dtype) attention_probs = self.dropout(attention_probs, training=training) # Attention result @@ -92,7 +96,8 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True, colbert_dim=-1, batch_size=256, query_max_length=512, passage_max_length=512, return_dense=True, return_sparse=False, return_colbert_vecs=False, dropout_rate=0.1): - super().__init__(name="bge-m3-tensorflow") + # Use safe model name (no hyphen or dot) to avoid TF resource container issues + super().__init__(name="bge_m3_tensorflow") self.model_name = model_name self.normalize_embeddings = normalize_embeddings @@ -118,11 +123,23 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True, self.num_layers = self.config.num_hidden_layers self.vocab_size = self.config.vocab_size + # Optional mixed precision + if self.use_fp16: + from tensorflow.keras import mixed_precision + try: + mixed_precision.set_global_policy("mixed_float16") + except Exception: + pass + # Build components self._build_embeddings() self._build_encoder_layers() self._build_pooler() + # Handle ColBERT dim parameter + self.colbert_dim = self.d_model if not colbert_dim or colbert_dim < 1 else int(colbert_dim) self._build_colbert() + # Sparse head (optional) + self.sparse_linear = tf.keras.layers.Dense(1, name="sparse_linear") # Tokenizer self.tokenizer = AutoTokenizer.from_pretrained( @@ -193,7 +210,7 @@ def _build_encoder_layers(self): num_heads=self.num_heads, intermediate_size=self.config.intermediate_size, dropout_rate=self.dropout_rate, - name=f"encoder.layer.{i}" + name=f"encoder_layer_{i}" ) self.encoder_layers.append(layer) @@ -203,13 +220,11 @@ def _build_pooler(self): self.d_model, activation='tanh', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), - name="pooler.dense" + name="pooler_dense" ) def _build_colbert(self): - self.colbert_linear = tf.keras.layers.Dense( - units=self.d_model, - ) + self.colbert_linear = tf.keras.layers.Dense(self.colbert_dim, name="colbert_linear") def call(self, inputs, training=False, output_hidden_states=False): @@ -225,7 +240,7 @@ def call(self, inputs, training=False, output_hidden_states=False): input_shape = self.shape_list(inputs_embeds)[:-1] if token_type_ids is None: - token_type_ids = tf.fill(dims=input_shape, value=0) + token_type_ids = tf.zeros_like(input_ids, dtype=tf.int32) if position_ids is None: if input_ids is not None: @@ -248,18 +263,17 @@ def call(self, inputs, training=False, output_hidden_states=False): if training: embedding_output = self.dropout(embedding_output, training=training) - attention_mask_origin = attention_mask + # Ensure attention mask exists and is float32 for numerical stability + if attention_mask is None: + attention_mask = tf.ones_like(input_ids, dtype=tf.int32) - attention_mask_shape = self.shape_list(attention_mask) - - extended_attention_mask = tf.reshape( - attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) - ) + attention_mask_origin = attention_mask - extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) - one_cst = tf.constant(1.0, dtype=embedding_output.dtype) - ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) - extended_attention_mask = 
tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + B = tf.shape(input_ids)[0] + L = tf.shape(input_ids)[1] + extended_attention_mask = tf.reshape(tf.cast(attention_mask, tf.float32), (B, 1, 1, L)) + # Large negative for masked positions (kept in float32) + extended_attention_mask = (1.0 - extended_attention_mask) * (-1e9) attention_mask = extended_attention_mask @@ -276,30 +290,32 @@ def call(self, inputs, training=False, output_hidden_states=False): if output_hidden_states: all_hidden_states.append(hidden_states) - # Pooling - if self.pooling_method == "mean": - pooled_output = tf.reduce_mean(hidden_states, axis=1) - else: # default: cls - pooled_output = hidden_states[:, 0, :] - - # Apply pooler if return_dense is True - if self.return_dense: - pooled_output = pooled_output - - # Normalize embeddings if specified - if self.normalize_embeddings: - pooled_output = tf.nn.l2_normalize(pooled_output, axis=-1) + # Final last_hidden_state (B, T, H) in float32 (no pooling here) + last_hidden_state = tf.cast(hidden_states, tf.float32) ## colbert_vecs - colbert_vecs = self.colbert_linear(hidden_states[:, 1:]) - colbert_vecs = colbert_vecs * tf.cast(attention_mask_origin[:, 1:][:, :, None], dtype=tf.float32) + colbert_vecs = None + if self.return_colbert_vecs: + # Compute in the native dtype (e.g., float16 under mixed precision) + colbert_in = hidden_states[:, 1:] + colbert_out = self.colbert_linear(colbert_in) + # Match mask dtype to colbert_out to avoid dtype mismatch in multiplication + m = tf.cast(attention_mask_origin[:, 1:], colbert_out.dtype)[:, :, None] + colbert_out = colbert_out * m + # Return as float32 for serving stability + colbert_vecs = tf.cast(colbert_out, tf.float32) outputs = { - "dense_vecs": pooled_output, - "colbert_vecs": colbert_vecs, - "last_hidden_state": hidden_states + "last_hidden_state": last_hidden_state } + if colbert_vecs is not None: + outputs["colbert_vecs"] = colbert_vecs + + if self.return_sparse: + token_weights = tf.nn.relu(self.sparse_linear(hidden_states)) + outputs["token_weights"] = token_weights + if output_hidden_states: outputs["hidden_states"] = all_hidden_states @@ -311,17 +327,15 @@ def __init__(self, d_model, num_heads, intermediate_size, dropout_rate=0.1, **kw super().__init__(**kwargs) self.attention = MultiHeadAttention(d_model, num_heads, dropout_rate) - self.attention_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) - self.attention_dropout = tf.keras.layers.Dropout(dropout_rate) # Intermediate -> gelu_approx self.intermediate = tf.keras.layers.Dense( intermediate_size, - name="intermediate.dense" + name="intermediate_dense" ) - self.output_dense = tf.keras.layers.Dense(d_model, name="output.dense") + self.output_dense = tf.keras.layers.Dense(d_model, name="output_dense") self.output_dropout = tf.keras.layers.Dropout(dropout_rate) - self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) + self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="output_LayerNorm") def gelu_approx(self, x): x = tf.convert_to_tensor(x) @@ -350,53 +364,57 @@ def call(self, x, attention_mask=None, training=False): return output -def save_model_with_tokenizer(model, tokenizer, save_path): - """Save both model and tokenizer""" +def save_model_with_tokenizer(model: "BGEM3TensorFlow", tokenizer, save_path: str): + """Export SavedModel with a single clean default signature. 
+ + inputs : int64 (input_ids, attention_mask) + outputs: last_hidden_state (B,T,H,float32), optional colbert_vecs (B,T-1,H,float32) + """ os.makedirs(save_path, exist_ok=True) model_save_path = os.path.join(save_path, 'model') - - # Ensure model is built by calling it with dummy inputs - dummy_inputs = { - 'input_ids': tf.zeros((2, 11), dtype=tf.int32), - 'attention_mask': tf.ones((2, 11), dtype=tf.int32) + # Clean previous export to avoid stale graph/variable metadata + try: + import shutil + if os.path.exists(model_save_path): + shutil.rmtree(model_save_path) + except Exception: + pass + + # Build variables once + dummy = { + 'input_ids': tf.zeros((2, 8), dtype=tf.int32), + 'attention_mask': tf.ones((2, 8), dtype=tf.int32), + 'token_type_ids': tf.zeros((2, 8), dtype=tf.int32), } - _ = model(dummy_inputs, training=False, output_hidden_states=True) + _ = model(dummy, training=False, output_hidden_states=False) - # Define serving signature @tf.function(input_signature=[ - tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='input_ids'), - tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='attention_mask') + tf.TensorSpec([None, None], tf.int64, name='input_ids'), + tf.TensorSpec([None, None], tf.int64, name='attention_mask'), ]) - def serving_fn(input_ids, attention_mask): + def serving_default(input_ids, attention_mask): + # Cast to int32, synthesize token_type_ids + ii = tf.cast(input_ids, tf.int32) + am = tf.cast(attention_mask, tf.int32) + tt = tf.zeros_like(ii) + + outs = model({'input_ids': ii, 'attention_mask': am, 'token_type_ids': tt}, + training=False, output_hidden_states=False) - print(input_ids) - inputs = { - 'input_ids': input_ids, - 'attention_mask': attention_mask + ret = { + 'last_hidden_state': tf.cast(outs['last_hidden_state'], tf.float32) } + if 'colbert_vecs' in outs: + ret['colbert_vecs'] = tf.cast(outs['colbert_vecs'], tf.float32) + return ret - outputs = model(inputs=inputs, training=False, output_hidden_states=True) - - if outputs.get('hidden_states'): - hidden_states = tf.stack(outputs['hidden_states'], axis=0) - return { - 'dense_vecs': outputs['dense_vecs'], # CLS Token - 'colbert_vecs': outputs['colbert_vecs'], - 'hidden_states': hidden_states # (num_layers, batch, seq_len, hidden_dim) - } - else: - return { - 'dense_vecs': outputs['dense_vecs'], - } - - # Save model + # Save the Keras model itself with a single default signature tf.saved_model.save( model, model_save_path, - signatures={'serving_default': serving_fn} + signatures={'serving_default': serving_default} ) - # Save tokenizer tokenizer.save_pretrained(save_path) return model_save_path diff --git a/BGEM3WeightConverter.py b/BGEM3WeightConverter.py index e6a26c2..8164086 100644 --- a/BGEM3WeightConverter.py +++ b/BGEM3WeightConverter.py @@ -16,7 +16,8 @@ def load_sparse_weights(): raise FileNotFoundError(f"FileNotFoundError: {model_path}") device = 'cuda' if torch.cuda.is_available() else 'cpu' - return torch.load(model_path, map_location=device, weights_only=True) + # Avoid weights_only for broader PyTorch compatibility + return torch.load(model_path, map_location=device) def load_colbert_weights(): @@ -55,11 +56,58 @@ def _init_colbert_weights(tf_model): colbert = load_colbert_weights() colbert_weights = colbert['weight'] colbert_bias = colbert['bias'] + # Convert to numpy and report shape + w = colbert_weights.detach().cpu().numpy() if hasattr(colbert_weights, "detach") else np.array(colbert_weights) + b = colbert_bias.detach().cpu().numpy() if hasattr(colbert_bias, "detach") else 
np.array(colbert_bias) - tf_model.colbert_linear.set_weights([ - colbert_weights.numpy().T, - colbert_bias.numpy() - ]) + out_dim, in_dim = w.shape # PT: (out_dim, in_dim) + print(f"ColBERT head weight shape: (out_dim={out_dim}, in_dim={in_dim})") + + # Ensure the Dense layer has matching units and is built + try: + current_units = getattr(tf_model.colbert_linear, "units", None) + except Exception: + current_units = None + + if current_units is not None and current_units != out_dim: + # Units mismatch; warn. Ideally create the model with detected colbert_dim to avoid this. + print(f"Warning: colbert_linear units ({current_units}) != detected out_dim ({out_dim}). We will attempt to set weights and may fail.") + + # Ensure variables exist. If not built yet, do a dummy call to build with correct in_dim. + if not getattr(tf_model.colbert_linear, "built", False): + dummy = tf.zeros((1, 2, in_dim), dtype=tf.float32) + _ = tf_model.colbert_linear(dummy) + + # Set weights (kernel shape: (in_dim, out_dim)) + tf_model.colbert_linear.set_weights([w.T, b]) + + +def _init_sparse_weights(tf_model): + """Initialize sparse head weights if available (optional).""" + try: + st = load_sparse_weights() + except FileNotFoundError as e: + print(str(e)) + return + + # Expect PyTorch shape: (out_dim=1, in_dim=hidden) + w_pt = st["weight"] + b_pt = st["bias"] + # Ensure numpy + if hasattr(w_pt, "cpu"): + w_np = w_pt.cpu().numpy() + else: + w_np = np.array(w_pt) + if hasattr(b_pt, "cpu"): + b_np = b_pt.cpu().numpy() + else: + b_np = np.array(b_pt) + + # Build layer if not built + in_dim = w_np.shape[1] + tf_model.sparse_linear.build((None, None, in_dim)) + # Keras Dense kernel shape: (in_dim, out_dim) + tf_model.sparse_linear.set_weights([w_np.T, b_np]) class BGEM3WeightConverter: @@ -85,15 +133,15 @@ def initialize_weights(self, tf_model): # Initialize encoder layers self._init_transformer_blocks(tf_model) - # Initialize pooler - self._init_pooler_weights(tf_model) - - # Initialize pooler + # Initialize pooler (once) self._init_pooler_weights(tf_model) # Initialize colbert _init_colbert_weights(tf_model) + # Initialize sparse head (optional) + _init_sparse_weights(tf_model) + return tf_model def _init_embedding_weights(self, tf_model): @@ -230,9 +278,28 @@ def _init_pooler_weights(self, tf_model): def convert_and_save_model(model_name: str, save_path: str): - """Convert PyTorch model to TensorFlow and save""" - # Initialize TensorFlow model - tf_model = BGEM3TensorFlow(model_name) + """Convert PyTorch model to TensorFlow and save. + Also detects and uses original ColBERT dimension for TF head. 
+ """ + # Detect ColBERT original dimension from weights (out_dim) + try: + colbert = load_colbert_weights() + colbert_w = colbert['weight'] + out_dim = int(colbert_w.shape[0]) + print(f"Detected ColBERT dimension: {out_dim}") + colbert_dim = out_dim + return_colbert_vecs = True + except Exception as e: + print(f"ColBERT weights not found or failed to load: {e}") + colbert_dim = -1 + return_colbert_vecs = False + + # Initialize TensorFlow model with detected colbert_dim + tf_model = BGEM3TensorFlow( + model_name, + colbert_dim=colbert_dim, + return_colbert_vecs=return_colbert_vecs, + ) # Convert weights converter = BGEM3WeightConverter(model_name) diff --git a/model_conversion_validator.py b/model_conversion_validator.py index 955640f..a417943 100644 --- a/model_conversion_validator.py +++ b/model_conversion_validator.py @@ -83,16 +83,64 @@ def load_converted_tf_model(saved_model_dir): """ model_path = f"{saved_model_dir}/model" loaded_model = tf.saved_model.load(model_path) - serving_fn = loaded_model.signatures["serving_default"] + # Prefer a compatible signature if available + prefer = [ + "serving_default", + "serving_int32_3in", + "serving_int64_3in", + "serving_int32_2in", + "serving_int64_2in", + ] + sigs = loaded_model.signatures + for k in prefer: + if k in sigs: + serving_fn = sigs[k] + break + else: + raise RuntimeError("No suitable serving signature found in SavedModel.") tokenizer = AutoTokenizer.from_pretrained(saved_model_dir) return serving_fn, tokenizer +def call_signature(sig, input_ids, attention_mask, token_type_ids=None): + """ + Call SavedModel signature with automatic key/dtype adaptation. + - Supplies only required keys + - Fills missing token_type_ids with zeros + - Casts inputs to signature dtypes + """ + # structured_input_signature: (args, kwargs) + spec_kwargs = sig.structured_input_signature[1] + + def prepare(name, value): + if name not in spec_kwargs: + return None + if value is None and name == "token_type_ids": + value = tf.zeros_like(input_ids) + want = spec_kwargs[name].dtype + if hasattr(value, "dtype") and value.dtype != want: + value = tf.cast(value, want) + return value + + kwargs = {} + x = prepare("input_ids", input_ids) + if x is not None: + kwargs["input_ids"] = x + x = prepare("attention_mask", attention_mask) + if x is not None: + kwargs["attention_mask"] = x + x = prepare("token_type_ids", token_type_ids) + if x is not None: + kwargs["token_type_ids"] = x + + return sig(**kwargs) + + def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): """ TensorFlow 모델(서빙 시그니처)로 임베딩 추출하는 함수. - BGEM3TensorFlow 구조상 "dense_vecs" 키에 최종 임베딩이 들어있다고 가정. + SavedModel은 last_hidden_state (B,T,H)만 반환하므로 CLS 풀링을 적용해 (B,H) 임베딩 생성. 
""" inputs = tokenizer( queries, @@ -102,11 +150,11 @@ def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): return_tensors="tf" ) - outputs = serving_fn( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"] - ) - embeddings = outputs["dense_vecs"].numpy() # (batch_size, hidden_size) + token_type_ids = inputs.get("token_type_ids", tf.zeros_like(inputs["input_ids"])) + outputs = call_signature(serving_fn, inputs["input_ids"], inputs["attention_mask"], token_type_ids) + # Serving returns last_hidden_state (B, T, H); apply CLS pooling for embedding + last_hidden = outputs["last_hidden_state"] # (B, T, H) + embeddings = last_hidden[:, 0, :].numpy() # (B, H) return embeddings @@ -125,15 +173,17 @@ def encode_with_tf_model_and_get_hidden_states(serving_fn, tokenizer, queries, m return_tensors="tf" ) - outputs = serving_fn( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"] - ) + token_type_ids = inputs.get("token_type_ids", tf.zeros_like(inputs["input_ids"])) + outputs = call_signature(serving_fn, inputs["input_ids"], inputs["attention_mask"], token_type_ids) - hidden_states = outputs["hidden_states"] # (num_layers, batch, seq_len, hidden_dim) - final_embeddings = outputs["dense_vecs"] - print("outputs['colbert_vecs'] : ") - print(outputs["colbert_vecs"]) + # Only last_hidden_state is returned in serving; keep KeyError behavior for old path + hidden_states = outputs["hidden_states"] # will raise KeyError (by design) + final_embeddings = outputs["last_hidden_state"] + if "colbert_vecs" in outputs: + print("outputs['colbert_vecs'] : ") + print(outputs["colbert_vecs"]) + else: + print("colbert_vecs not returned by TF model (flag disabled).") return final_embeddings.numpy(), hidden_states diff --git a/tf_colbert_loader.py b/tf_colbert_loader.py new file mode 100644 index 0000000..8005891 --- /dev/null +++ b/tf_colbert_loader.py @@ -0,0 +1,101 @@ +import os +import argparse +import tensorflow as tf +from transformers import AutoTokenizer + + +def resolve_model_path(root_dir: str) -> str: + """Return a path that contains saved_model.pb (root or /model).""" + cand1 = root_dir + cand2 = os.path.join(root_dir, "model") + if os.path.exists(os.path.join(cand2, "saved_model.pb")): + return cand2 + if os.path.exists(os.path.join(cand1, "saved_model.pb")): + return cand1 + raise FileNotFoundError(f"No SavedModel found under '{root_dir}' (checked '{cand2}' and '{cand1}')") + + +def call_signature(sig, input_ids, attention_mask, token_type_ids=None): + """Call signature with best-effort arg set and dtype handling. + Tries (int64,int32) x (3-key,2-key) in order. 
+ """ + last_err = None + for dtype in (tf.int64, tf.int32): + ii = tf.cast(input_ids, dtype) + am = tf.cast(attention_mask, dtype) + # Try 3-key first + if token_type_ids is not None: + try: + tt = tf.cast(token_type_ids, dtype) + return sig(input_ids=ii, attention_mask=am, token_type_ids=tt) + except Exception as e: + last_err = e + # Then 2-key + try: + return sig(input_ids=ii, attention_mask=am) + except Exception as e: + last_err = e + continue + if last_err is not None: + raise last_err + raise RuntimeError("Failed to call signature with any supported argument pattern") + + +def main(): + parser = argparse.ArgumentParser(description="Load TF ColBERT SavedModel and print output shapes.") + parser.add_argument("--model_dir", default="./converted_bge_m3", help="Path to SavedModel root (contains tokenizer files).") + parser.add_argument("--max_length", type=int, default=128, help="Tokenization max length.") + parser.add_argument("--texts", nargs="*", default=[ + "이 모델은 무엇을 하는 모델인가요?", + "bge-m3 tensorflow colbert vectors test", + ]) + args = parser.parse_args() + + # Load tokenizer from the same root dir + tok = AutoTokenizer.from_pretrained(args.model_dir) + inputs = tok( + args.texts, + padding=True, + truncation=True, + max_length=args.max_length, + return_tensors="tf", + ) + # token_type_ids may be missing for XLM-R; create zeros if absent + if "token_type_ids" not in inputs: + inputs["token_type_ids"] = tf.zeros_like(inputs["input_ids"], dtype=inputs["input_ids"].dtype) + + # Load SavedModel signature + model_path = resolve_model_path(args.model_dir) + loaded = tf.saved_model.load(model_path) + sig = loaded.signatures.get("serving_default") + if sig is None: + raise RuntimeError("serving_default signature not found") + + # Call signature robustly + outs = None + try: + outs = call_signature(sig, inputs["input_ids"], inputs["attention_mask"], inputs.get("token_type_ids")) + except TypeError: + # Try without token_type_ids + outs = call_signature(sig, inputs["input_ids"], inputs["attention_mask"], None) + + # Print keys and shapes + print("Signature outputs:") + for k, v in outs.items(): + try: + print(f"- {k}: shape={tuple(v.shape)}, dtype={v.dtype}") + except Exception: + print(f"- {k}: ") + + # Convenience checks + if "last_hidden_state" in outs: + print("last_hidden_state OK ->", tuple(outs["last_hidden_state"].shape)) + if "colbert_vecs" in outs: + print("colbert_vecs OK ->", tuple(outs["colbert_vecs"].shape)) + else: + print("colbert_vecs not present in signature outputs.") + # hidden_states is no longer returned in serving by design + + +if __name__ == "__main__": + main() diff --git a/torch_tf_validator.py b/torch_tf_validator.py index 0efe3ce..b745ab4 100644 --- a/torch_tf_validator.py +++ b/torch_tf_validator.py @@ -28,6 +28,33 @@ def tokenize_w_padding(tokenizer, text, return_tensors="pt", max_length=512): return tokenizer(text, padding="max_length", max_length=max_length, return_tensors=return_tensors) +def call_signature(sig, input_ids, attention_mask, token_type_ids=None): + """Adapt arguments to a SavedModel signature: keys and dtypes.""" + spec_kwargs = sig.structured_input_signature[1] + + def prepare(name, value): + if name not in spec_kwargs: + return None + if value is None and name == "token_type_ids": + value = tf.zeros_like(input_ids) + want = spec_kwargs[name].dtype + if hasattr(value, "dtype") and value.dtype != want: + value = tf.cast(value, want) + return value + + kwargs = {} + x = prepare("input_ids", input_ids) + if x is not None: + kwargs["input_ids"] = 
x + x = prepare("attention_mask", attention_mask) + if x is not None: + kwargs["attention_mask"] = x + x = prepare("token_type_ids", token_type_ids) + if x is not None: + kwargs["token_type_ids"] = x + return sig(**kwargs) + + def main(): # Load the model model_path = "BAAI/bge-m3" @@ -61,34 +88,40 @@ def main(): inputs_tf = tokenize_wo_padding(tokenizer, text, return_tensors="tf") inputs_tf_w_padding = tokenize_w_padding(tokenizer, text, return_tensors="tf") - inputs_tf_w_padding_attnFixed = inputs_tf_w_padding.copy() - inputs_tf_w_padding_attnFixed['attention_mask'] = tf.where(inputs_tf_w_padding['attention_mask'] == 0, -9999999, 0) - tf_model = load_tf_model(model_path_tf).signatures["serving_default"] + loaded = load_tf_model(model_path_tf) + # Use the default 2-input signature + sigs = loaded.signatures + tf_model = sigs.get("serving_default") + if tf_model is None: + raise RuntimeError("serving_default signature not found") loguru.logger.info("Tensorflow] Model output".ljust(50, "-")) with tf.device("/GPU:0"): - output_tf = tf_model(**inputs_tf) - output_tf_w_padding = tf_model(**inputs_tf_w_padding) - output_tf_w_padding_attnFixed = tf_model(**inputs_tf_w_padding_attnFixed) + output_tf = call_signature( + tf_model, inputs_tf["input_ids"], inputs_tf["attention_mask"], None + ) + output_tf_w_padding = call_signature( + tf_model, + inputs_tf_w_padding["input_ids"], + inputs_tf_w_padding["attention_mask"], + None, + ) loguru.logger.info("output without padding (GT)".ljust(50, "-")) - loguru.logger.info(output_tf['hidden_states'][-1][:,0]) + hs = output_tf['last_hidden_state'] + val_no_pad = hs[:, 0] + loguru.logger.info(val_no_pad) loguru.logger.info("="*50) loguru.logger.info("output with padding".ljust(50, "-")) - loguru.logger.info(output_tf_w_padding['hidden_states'][-1][:,0]) + hsw = output_tf_w_padding['last_hidden_state'] + val_pad = hsw[:, 0] + loguru.logger.info(val_pad) loguru.logger.info("="*50) - loguru.logger.info("output with padding (attention fixed)".ljust(50, "-")) - loguru.logger.info(output_tf_w_padding_attnFixed['hidden_states'][-1][:,0]) - loguru.logger.info("="*50) - err_tf = tf.abs(output_tf['hidden_states'][-1][:,0] - output_tf_w_padding['hidden_states'][-1][:,0]) + err_tf = tf.abs(val_no_pad - val_pad) loguru.logger.info("Error".ljust(50, "-")) loguru.logger.info(tf.reduce_mean(err_tf)) loguru.logger.info("="*50) - err_tf_attnFixed = tf.abs(output_tf_w_padding['hidden_states'][-1][:,0] - output_tf_w_padding_attnFixed['hidden_states'][-1][:,0]) - loguru.logger.info("Error (attention fixed)".ljust(50, "-")) - loguru.logger.info(tf.reduce_mean(err_tf_attnFixed)) - loguru.logger.info("="*50) if __name__ == "__main__": - main() \ No newline at end of file + main() From 2496b49e1750dd1c3fc6f37e4f6a884c86601c13 Mon Sep 17 00:00:00 2001 From: sigridjineth Date: Sun, 7 Sep 2025 18:13:11 +0900 Subject: [PATCH 2/3] fix --- BGEM3TFModel.py | 216 ++++++++-------- BGEM3TFModel_tfkeras2.py | 246 ++++++++++++++++++ BGEM3WeightConverter.py | 91 +------ export_tf1_saved_model.py | 369 +++++++++++++++++++++++++++ model_conversion_validator.py | 465 +++++++++------------------------- tf1_session_validator.py | 39 +++ tf1_validator.py | 165 ++++++++++++ tf_colbert_loader.py | 2 +- torch_tf_validator.py | 198 ++++++--------- 9 files changed, 1128 insertions(+), 663 deletions(-) create mode 100644 BGEM3TFModel_tfkeras2.py create mode 100644 export_tf1_saved_model.py create mode 100644 tf1_session_validator.py create mode 100644 tf1_validator.py diff --git a/BGEM3TFModel.py 
b/BGEM3TFModel.py index 48eef84..14e8fc9 100644 --- a/BGEM3TFModel.py +++ b/BGEM3TFModel.py @@ -18,27 +18,25 @@ def __init__(self, d_model, num_heads, dropout_rate=0.1, **kwargs): self.d_model = d_model self.depth = d_model // num_heads # 각 헤드의 차원 크기 - # Query, Key, Value를 위한 Dense Layer (stable names for SavedModel) - self.wq = tf.keras.layers.Dense(d_model, name="attention_wq") - self.wk = tf.keras.layers.Dense(d_model, name="attention_wk") - self.wv = tf.keras.layers.Dense(d_model, name="attention_wv") + # Query, Key, Value를 위한 Dense Layer + self.wq = tf.keras.layers.Dense(d_model) + self.wk = tf.keras.layers.Dense(d_model) + self.wv = tf.keras.layers.Dense(d_model) # 출력 레이어 - self.dense = tf.keras.layers.Dense(d_model, name="attention_output") + self.dense = tf.keras.layers.Dense(d_model) # 어텐션 layerNorm - self.attlayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="attn_LayerNorm") + self.attlayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-5) # 드롭아웃 self.dropout = tf.keras.layers.Dropout(dropout_rate) - def stable_softmax(self, logits, axis=-1, name=None): - """Numerically stable softmax: subtract max and compute in float32.""" - dtype = logits.dtype - x = tf.cast(logits, tf.float32) - x = x - tf.reduce_max(x, axis=axis, keepdims=True) - probs = tf.nn.softmax(x, axis=axis, name=name) - return tf.cast(probs, dtype) + def stable_softmax(self, logits, axis=None, name=None): + """ + Stable softmax implementation + """ + return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name) def split_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) @@ -47,29 +45,27 @@ def split_heads(self, x, batch_size): def call(self, inputs, mask=None, training=False): batch_size = tf.shape(inputs)[0] - # Projections - q = self.wq(inputs) - k = self.wk(inputs) - v = self.wv(inputs) + # Query, Key, Value를 계산 + q = self.wq(inputs) # (batch_size, seq_len, d_model) + k = self.wk(inputs) # (batch_size, seq_len, d_model) + v = self.wv(inputs) # (batch_size, seq_len, d_model) - # Split heads - q = self.split_heads(q, batch_size) - k = self.split_heads(k, batch_size) - v = self.split_heads(v, batch_size) + # 다중 헤드로 분리 + q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth) + k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth) + v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth) - # Scaled dot-product attention (compute in float32 for stability) - q_f = tf.cast(q, tf.float32) - k_f = tf.cast(k, tf.float32) - attention_scores = tf.matmul(q_f, k_f, transpose_b=True) - scale = tf.sqrt(tf.cast(self.depth, tf.float32)) - attention_scores = attention_scores / scale + # Scaled Dot-Product Attention + sqrt_att_head_size = math.sqrt(self.depth) + + attention_scores = tf.matmul(q, k, transpose_b=True) # (batch_size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(sqrt_att_head_size, tf.float32) + attention_scores = tf.divide(attention_scores, dk) if mask is not None: - attention_scores = attention_scores + tf.cast(mask, tf.float32) + attention_scores = tf.add(attention_scores, mask) attention_probs = self.stable_softmax(attention_scores, axis=-1) - # Cast back to v dtype for matmul efficiency under mixed precision - attention_probs = tf.cast(attention_probs, v.dtype) attention_probs = self.dropout(attention_probs, training=training) # Attention result @@ -96,8 +92,7 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True, colbert_dim=-1, batch_size=256, 
query_max_length=512, passage_max_length=512, return_dense=True, return_sparse=False, return_colbert_vecs=False, dropout_rate=0.1): - # Use safe model name (no hyphen or dot) to avoid TF resource container issues - super().__init__(name="bge_m3_tensorflow") + super().__init__(name="bge-m3-tensorflow") self.model_name = model_name self.normalize_embeddings = normalize_embeddings @@ -123,23 +118,11 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True, self.num_layers = self.config.num_hidden_layers self.vocab_size = self.config.vocab_size - # Optional mixed precision - if self.use_fp16: - from tensorflow.keras import mixed_precision - try: - mixed_precision.set_global_policy("mixed_float16") - except Exception: - pass - # Build components self._build_embeddings() self._build_encoder_layers() self._build_pooler() - # Handle ColBERT dim parameter - self.colbert_dim = self.d_model if not colbert_dim or colbert_dim < 1 else int(colbert_dim) self._build_colbert() - # Sparse head (optional) - self.sparse_linear = tf.keras.layers.Dense(1, name="sparse_linear") # Tokenizer self.tokenizer = AutoTokenizer.from_pretrained( @@ -210,7 +193,7 @@ def _build_encoder_layers(self): num_heads=self.num_heads, intermediate_size=self.config.intermediate_size, dropout_rate=self.dropout_rate, - name=f"encoder_layer_{i}" + name=f"encoder.layer.{i}" ) self.encoder_layers.append(layer) @@ -220,11 +203,13 @@ def _build_pooler(self): self.d_model, activation='tanh', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), - name="pooler_dense" + name="pooler.dense" ) def _build_colbert(self): - self.colbert_linear = tf.keras.layers.Dense(self.colbert_dim, name="colbert_linear") + self.colbert_linear = tf.keras.layers.Dense( + units=self.d_model, + ) def call(self, inputs, training=False, output_hidden_states=False): @@ -240,7 +225,7 @@ def call(self, inputs, training=False, output_hidden_states=False): input_shape = self.shape_list(inputs_embeds)[:-1] if token_type_ids is None: - token_type_ids = tf.zeros_like(input_ids, dtype=tf.int32) + token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: if input_ids is not None: @@ -263,17 +248,18 @@ def call(self, inputs, training=False, output_hidden_states=False): if training: embedding_output = self.dropout(embedding_output, training=training) - # Ensure attention mask exists and is float32 for numerical stability - if attention_mask is None: - attention_mask = tf.ones_like(input_ids, dtype=tf.int32) - attention_mask_origin = attention_mask - B = tf.shape(input_ids)[0] - L = tf.shape(input_ids)[1] - extended_attention_mask = tf.reshape(tf.cast(attention_mask, tf.float32), (B, 1, 1, L)) - # Large negative for masked positions (kept in float32) - extended_attention_mask = (1.0 - extended_attention_mask) * (-1e9) + attention_mask_shape = self.shape_list(attention_mask) + + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) attention_mask = extended_attention_mask @@ -290,32 +276,30 @@ def call(self, inputs, training=False, output_hidden_states=False): if output_hidden_states: all_hidden_states.append(hidden_states) - # Final 
last_hidden_state (B, T, H) in float32 (no pooling here) - last_hidden_state = tf.cast(hidden_states, tf.float32) + # Pooling + if self.pooling_method == "mean": + pooled_output = tf.reduce_mean(hidden_states, axis=1) + else: # default: cls + pooled_output = hidden_states[:, 0, :] + + # Apply pooler if return_dense is True + if self.return_dense: + pooled_output = pooled_output + + # Normalize embeddings if specified + if self.normalize_embeddings: + pooled_output = tf.nn.l2_normalize(pooled_output, axis=-1) ## colbert_vecs - colbert_vecs = None - if self.return_colbert_vecs: - # Compute in the native dtype (e.g., float16 under mixed precision) - colbert_in = hidden_states[:, 1:] - colbert_out = self.colbert_linear(colbert_in) - # Match mask dtype to colbert_out to avoid dtype mismatch in multiplication - m = tf.cast(attention_mask_origin[:, 1:], colbert_out.dtype)[:, :, None] - colbert_out = colbert_out * m - # Return as float32 for serving stability - colbert_vecs = tf.cast(colbert_out, tf.float32) + colbert_vecs = self.colbert_linear(hidden_states[:, 1:]) + colbert_vecs = colbert_vecs * tf.cast(attention_mask_origin[:, 1:][:, :, None], dtype=tf.float32) outputs = { - "last_hidden_state": last_hidden_state + "dense_vecs": pooled_output, + "colbert_vecs": colbert_vecs, + "last_hidden_state": hidden_states } - if colbert_vecs is not None: - outputs["colbert_vecs"] = colbert_vecs - - if self.return_sparse: - token_weights = tf.nn.relu(self.sparse_linear(hidden_states)) - outputs["token_weights"] = token_weights - if output_hidden_states: outputs["hidden_states"] = all_hidden_states @@ -327,15 +311,17 @@ def __init__(self, d_model, num_heads, intermediate_size, dropout_rate=0.1, **kw super().__init__(**kwargs) self.attention = MultiHeadAttention(d_model, num_heads, dropout_rate) + self.attention_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) + self.attention_dropout = tf.keras.layers.Dropout(dropout_rate) # Intermediate -> gelu_approx self.intermediate = tf.keras.layers.Dense( intermediate_size, - name="intermediate_dense" + name="intermediate.dense" ) - self.output_dense = tf.keras.layers.Dense(d_model, name="output_dense") + self.output_dense = tf.keras.layers.Dense(d_model, name="output.dense") self.output_dropout = tf.keras.layers.Dropout(dropout_rate) - self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="output_LayerNorm") + self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) def gelu_approx(self, x): x = tf.convert_to_tensor(x) @@ -364,57 +350,53 @@ def call(self, x, attention_mask=None, training=False): return output -def save_model_with_tokenizer(model: "BGEM3TensorFlow", tokenizer, save_path: str): - """Export SavedModel with a single clean default signature. 
- - inputs : int64 (input_ids, attention_mask) - outputs: last_hidden_state (B,T,H,float32), optional colbert_vecs (B,T-1,H,float32) - """ +def save_model_with_tokenizer(model, tokenizer, save_path): + """Save both model and tokenizer""" os.makedirs(save_path, exist_ok=True) model_save_path = os.path.join(save_path, 'model') - # Clean previous export to avoid stale graph/variable metadata - try: - import shutil - if os.path.exists(model_save_path): - shutil.rmtree(model_save_path) - except Exception: - pass - - # Build variables once - dummy = { - 'input_ids': tf.zeros((2, 8), dtype=tf.int32), - 'attention_mask': tf.ones((2, 8), dtype=tf.int32), - 'token_type_ids': tf.zeros((2, 8), dtype=tf.int32), + + # Ensure model is built by calling it with dummy inputs + dummy_inputs = { + 'input_ids': tf.zeros((2, 11), dtype=tf.int32), + 'attention_mask': tf.ones((2, 11), dtype=tf.int32) } - _ = model(dummy, training=False, output_hidden_states=False) + _ = model(dummy_inputs, training=False, output_hidden_states=True) + # Define serving signature @tf.function(input_signature=[ - tf.TensorSpec([None, None], tf.int64, name='input_ids'), - tf.TensorSpec([None, None], tf.int64, name='attention_mask'), + tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='input_ids'), + tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='attention_mask') ]) - def serving_default(input_ids, attention_mask): - # Cast to int32, synthesize token_type_ids - ii = tf.cast(input_ids, tf.int32) - am = tf.cast(attention_mask, tf.int32) - tt = tf.zeros_like(ii) - - outs = model({'input_ids': ii, 'attention_mask': am, 'token_type_ids': tt}, - training=False, output_hidden_states=False) + def serving_fn(input_ids, attention_mask): - ret = { - 'last_hidden_state': tf.cast(outs['last_hidden_state'], tf.float32) + print(input_ids) + inputs = { + 'input_ids': input_ids, + 'attention_mask': attention_mask } - if 'colbert_vecs' in outs: - ret['colbert_vecs'] = tf.cast(outs['colbert_vecs'], tf.float32) - return ret - # Save the Keras model itself with a single default signature + outputs = model(inputs=inputs, training=False, output_hidden_states=True) + + if outputs.get('hidden_states'): + hidden_states = tf.stack(outputs['hidden_states'], axis=0) + return { + 'dense_vecs': outputs['dense_vecs'], # CLS Token + 'colbert_vecs': outputs['colbert_vecs'], + 'hidden_states': hidden_states # (num_layers, batch, seq_len, hidden_dim) + } + else: + return { + 'dense_vecs': outputs['dense_vecs'], + } + + # Save model tf.saved_model.save( model, model_save_path, - signatures={'serving_default': serving_default} + signatures={'serving_default': serving_fn} ) + # Save tokenizer tokenizer.save_pretrained(save_path) return model_save_path diff --git a/BGEM3TFModel_tfkeras2.py b/BGEM3TFModel_tfkeras2.py new file mode 100644 index 0000000..cb538b2 --- /dev/null +++ b/BGEM3TFModel_tfkeras2.py @@ -0,0 +1,246 @@ +from typing import Dict, List, Union +import numpy as np +import tensorflow as tf +from tensorflow.keras import layers, Model + +def gelu_exact(x): + # exact erf-based GELU + return tf.nn.gelu(x, approximate=False) + +# --- 맨 위에 추가 --- +class TorchLayerNorm(layers.Layer): + def __init__(self, hidden_size: int, eps: float = 1e-5, name: str = None, **kwargs): + super().__init__(name=name, **kwargs) + self.hidden_size = hidden_size + self.eps = eps + + def build(self, input_shape): + self.gamma = self.add_weight(name="gamma", shape=(self.hidden_size,), initializer="ones", dtype=tf.float32, trainable=True) + self.beta = 
self.add_weight(name="beta", shape=(self.hidden_size,), initializer="zeros", dtype=tf.float32, trainable=True) + super().build(input_shape) + + def call(self, x): + # PyTorch LayerNorm과 동일: 모집단 분산(mean of squares) 사용 + mean = tf.reduce_mean(x, axis=-1, keepdims=True) + var = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True) + xhat = (x - mean) / tf.sqrt(var + self.eps) + return xhat * self.gamma + self.beta + + +class TorchLayerNormTF1: + def __init__(self, hidden_size: int, eps: float = 1e-5, scope: str = "LayerNorm"): + self.hidden_size = hidden_size + self.eps = eps + with tf.compat.v1.variable_scope(scope, reuse=tf.compat.v1.AUTO_REUSE): + self.gamma = tf.compat.v1.get_variable( + "weight", + shape=[hidden_size], + initializer=tf.compat.v1.initializers.ones(), + dtype=tf.float32, + ) + self.beta = tf.compat.v1.get_variable( + "bias", + shape=[hidden_size], + initializer=tf.compat.v1.initializers.zeros(), + dtype=tf.float32, + ) + + def __call__(self, x): + mean = tf.reduce_mean(x, axis=-1, keepdims=True) + var = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True) + xhat = (x - mean) / tf.sqrt(var + self.eps) + return xhat * self.gamma + self.beta + + +class MultiHeadAttention(layers.Layer): + def __init__(self, d_model: int, num_heads: int, dropout_rate: float = 0.0, **kwargs): + super().__init__(**kwargs) + if d_model % num_heads != 0: + raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})") + self.d_model = d_model + self.num_heads = num_heads + self.depth = d_model // num_heads + + self.wq = layers.Dense(d_model, use_bias=True, name=f"{self.name}.wq") + self.wk = layers.Dense(d_model, use_bias=True, name=f"{self.name}.wk") + self.wv = layers.Dense(d_model, use_bias=True, name=f"{self.name}.wv") + self.dense = layers.Dense(d_model, use_bias=True, name=f"{self.name}.dense") + + self.attlayerNorm = TorchLayerNormTF1(self.d_model, eps=1e-5, scope=f"{self.name}.attlayerNorm") + self.dropout = layers.Dropout(rate=dropout_rate) + + def split_heads(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, inputs, mask=None, training=False): + bsz = tf.shape(inputs)[0] + q = self.split_heads(self.wq(inputs), bsz) + k = self.split_heads(self.wk(inputs), bsz) + v = self.split_heads(self.wv(inputs), bsz) + + dk = tf.cast(self.depth, tf.float32) + attn_scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(dk) + if mask is not None: + attn_scores = attn_scores + mask + + # stable softmax (+1e-9) to mirror the working code + attn_probs = tf.nn.softmax(attn_scores + 1e-9, axis=-1) + attn_probs = self.dropout(attn_probs, training=training) + + ctx = tf.matmul(attn_probs, v) + ctx = tf.transpose(ctx, perm=[0, 2, 1, 3]) + ctx = tf.reshape(ctx, (bsz, -1, self.d_model)) + + out = self.dense(ctx) + if training: + out = self.dropout(out, training=training) + out = self.attlayerNorm(out + inputs) + return out + +class TransformerBlock(layers.Layer): + def __init__(self, d_model: int, num_heads: int, intermediate_size: int, dropout_rate: float = 0.0, **kwargs): + super().__init__(**kwargs) + self.attention = MultiHeadAttention(d_model, num_heads, dropout_rate, name=f"{self.name}.attention") + self.intermediate = layers.Dense(intermediate_size, use_bias=True, name=f"{self.name}.intermediate.dense") + self.output_dense = layers.Dense(d_model, use_bias=True, name=f"{self.name}.output.dense") + self.output_dropout = layers.Dropout(dropout_rate) + self.output_norm = 
TorchLayerNormTF1(d_model, eps=1e-5, scope=f"{self.name}.output.LayerNorm") + + def call(self, x, attention_mask=None, training=False): + x_att = self.attention(x, mask=attention_mask, training=training) + inter = self.intermediate(x_att) + inter = gelu_exact(inter) + out = self.output_dense(inter) + if training: + out = self.output_dropout(out, training=training) + x_out = self.output_norm(out + x_att) + return x_out + +class BGEM3TensorFlow(Model): + def __init__( + self, + vocab_size: int = 250002, + max_position_embeddings: int = 8194, + type_vocab_size: int = 1, + hidden_size: int = 1024, + num_hidden_layers: int = 24, + num_attention_heads: int = 16, + intermediate_size: int = 4096, + dropout_rate: float = 0.0, + name: str = "bge-m3-tensorflow", + ): + super().__init__(name=name) + self.padding_idx = 1 + self.hidden_size = hidden_size + self.num_layers = num_hidden_layers + + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="embeddings", shape=[vocab_size, hidden_size], + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), dtype=tf.float32 + ) + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", shape=[max_position_embeddings, hidden_size], + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), dtype=tf.float32 + ) + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", shape=[type_vocab_size, hidden_size], + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), dtype=tf.float32 + ) + + # 이름은 HF와 다를 수 있어도 상관없음. 가중치만 정확히 들어가면 됩니다. + self.layerNorm = TorchLayerNormTF1(hidden_size, eps=1e-5, scope="embeddings.LayerNorm") + self.dropout = layers.Dropout(rate=dropout_rate) + + self.encoder_layers: List[TransformerBlock] = [] + for i in range(num_hidden_layers): + self.encoder_layers.append( + TransformerBlock( + d_model=hidden_size, + num_heads=num_attention_heads, + intermediate_size=intermediate_size, + dropout_rate=dropout_rate, + name=f"encoder.layer.{i}", + ) + ) + + # pooler는 내보내기 시점 디버그용으로만 빌드. dense_vecs는 raw CLS 반환. + self.pooler = layers.Dense(hidden_size, activation="tanh", use_bias=True, name="pooler.dense") + self.colbert_linear = layers.Dense(units=hidden_size, use_bias=True, name="colbert_linear") + + # 경고 제거용 + def build(self, input_shape): + self.built = True + + @staticmethod + def _create_position_ids_from_attention_mask(attention_mask, padding_idx=1, past_key_values_length=0): + # attention_mask: [B,T] 0/1 + mask = tf.cast(attention_mask, tf.int32) + incremental = tf.math.cumsum(mask, axis=1) + if past_key_values_length != 0: + incremental = incremental + tf.cast(past_key_values_length, tf.int32) + # pads -> padding_idx, tokens -> cumsum + padding_idx + return incremental * mask + tf.cast(padding_idx, tf.int32) + + @staticmethod + def _shape_list(t: Union[tf.Tensor, np.ndarray]) -> List[int]: + if isinstance(t, np.ndarray): + return list(t.shape) + dynamic = tf.shape(t) + static = t.shape.as_list() + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + # === HF 로직과 완전히 동일하게 다시 작성 (형/연산 순서 포함) === + @staticmethod + def _create_position_ids_from_input_ids(input_ids, padding_idx=1, past_key_values_length=0): + # Replace non-padding symbols with their position numbers. + # Position numbers begin at padding_idx + 1. 
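+        # Worked example (illustrative ids): input_ids [[0, 2444, 2, 1, 1]] with padding_idx=1
+        #   mask              = [[1, 1, 1, 0, 0]]
+        #   cumsum(mask)      = [[1, 2, 3, 3, 3]]
+        #   cumsum*mask + pad = [[2, 3, 4, 1, 1]]  -> pads keep padding_idx, real tokens start at padding_idx + 1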
+ mask = tf.cast(tf.not_equal(input_ids, padding_idx), dtype=tf.int32) + incremental = tf.math.cumsum(mask, axis=1) + if past_key_values_length != 0: + incremental = incremental + tf.cast(past_key_values_length, tf.int32) + incremental = incremental * mask + tf.cast(padding_idx, tf.int32) + return tf.cast(incremental, dtype=input_ids.dtype) + + def call(self, inputs: Dict[str, tf.Tensor], training=False, output_hidden_states: bool = False): + input_ids = tf.cast(inputs["input_ids"], tf.int32) + attention_mask = tf.cast(inputs["attention_mask"], tf.int32) + + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + pos_ids = self._create_position_ids_from_attention_mask(attention_mask, padding_idx=self.padding_idx) + pos_ids = tf.cast(pos_ids, dtype=input_ids.dtype) + pos_embeds = tf.gather(params=self.position_embeddings, indices=pos_ids) + + token_type_ids = tf.zeros_like(input_ids) + tok_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + + hidden_states = inputs_embeds + pos_embeds + tok_type_embeds + hidden_states = self.layerNorm(hidden_states) + if training: + hidden_states = self.dropout(hidden_states, training=training) + + bsz, _, _ = self._shape_list(hidden_states) + ext_mask = tf.reshape(attention_mask, (bsz, 1, 1, -1)) + ext_mask = tf.cast(ext_mask, dtype=hidden_states.dtype) + ext_mask = (1.0 - ext_mask) * tf.constant(-10000.0, dtype=hidden_states.dtype) + + all_h = [] + if output_hidden_states: + all_h.append(hidden_states) + + for layer in self.encoder_layers: + hidden_states = layer(hidden_states, attention_mask=ext_mask, training=training) + if output_hidden_states: + all_h.append(hidden_states) + + last_hidden_state = hidden_states + colbert_vecs = self.colbert_linear(last_hidden_state[:, 1:]) + colbert_vecs = colbert_vecs * tf.cast(attention_mask[:, 1:][:, :, None], dtype=tf.float32) + + out = {"last_hidden_state": last_hidden_state, "colbert_vecs": colbert_vecs} + if output_hidden_states: + out["hidden_states"] = tf.stack(all_h, axis=0) + return out diff --git a/BGEM3WeightConverter.py b/BGEM3WeightConverter.py index 8164086..e6a26c2 100644 --- a/BGEM3WeightConverter.py +++ b/BGEM3WeightConverter.py @@ -16,8 +16,7 @@ def load_sparse_weights(): raise FileNotFoundError(f"FileNotFoundError: {model_path}") device = 'cuda' if torch.cuda.is_available() else 'cpu' - # Avoid weights_only for broader PyTorch compatibility - return torch.load(model_path, map_location=device) + return torch.load(model_path, map_location=device, weights_only=True) def load_colbert_weights(): @@ -56,58 +55,11 @@ def _init_colbert_weights(tf_model): colbert = load_colbert_weights() colbert_weights = colbert['weight'] colbert_bias = colbert['bias'] - # Convert to numpy and report shape - w = colbert_weights.detach().cpu().numpy() if hasattr(colbert_weights, "detach") else np.array(colbert_weights) - b = colbert_bias.detach().cpu().numpy() if hasattr(colbert_bias, "detach") else np.array(colbert_bias) - out_dim, in_dim = w.shape # PT: (out_dim, in_dim) - print(f"ColBERT head weight shape: (out_dim={out_dim}, in_dim={in_dim})") - - # Ensure the Dense layer has matching units and is built - try: - current_units = getattr(tf_model.colbert_linear, "units", None) - except Exception: - current_units = None - - if current_units is not None and current_units != out_dim: - # Units mismatch; warn. Ideally create the model with detected colbert_dim to avoid this. - print(f"Warning: colbert_linear units ({current_units}) != detected out_dim ({out_dim}). 
We will attempt to set weights and may fail.") - - # Ensure variables exist. If not built yet, do a dummy call to build with correct in_dim. - if not getattr(tf_model.colbert_linear, "built", False): - dummy = tf.zeros((1, 2, in_dim), dtype=tf.float32) - _ = tf_model.colbert_linear(dummy) - - # Set weights (kernel shape: (in_dim, out_dim)) - tf_model.colbert_linear.set_weights([w.T, b]) - - -def _init_sparse_weights(tf_model): - """Initialize sparse head weights if available (optional).""" - try: - st = load_sparse_weights() - except FileNotFoundError as e: - print(str(e)) - return - - # Expect PyTorch shape: (out_dim=1, in_dim=hidden) - w_pt = st["weight"] - b_pt = st["bias"] - # Ensure numpy - if hasattr(w_pt, "cpu"): - w_np = w_pt.cpu().numpy() - else: - w_np = np.array(w_pt) - if hasattr(b_pt, "cpu"): - b_np = b_pt.cpu().numpy() - else: - b_np = np.array(b_pt) - - # Build layer if not built - in_dim = w_np.shape[1] - tf_model.sparse_linear.build((None, None, in_dim)) - # Keras Dense kernel shape: (in_dim, out_dim) - tf_model.sparse_linear.set_weights([w_np.T, b_np]) + tf_model.colbert_linear.set_weights([ + colbert_weights.numpy().T, + colbert_bias.numpy() + ]) class BGEM3WeightConverter: @@ -133,15 +85,15 @@ def initialize_weights(self, tf_model): # Initialize encoder layers self._init_transformer_blocks(tf_model) - # Initialize pooler (once) + # Initialize pooler + self._init_pooler_weights(tf_model) + + # Initialize pooler self._init_pooler_weights(tf_model) # Initialize colbert _init_colbert_weights(tf_model) - # Initialize sparse head (optional) - _init_sparse_weights(tf_model) - return tf_model def _init_embedding_weights(self, tf_model): @@ -278,28 +230,9 @@ def _init_pooler_weights(self, tf_model): def convert_and_save_model(model_name: str, save_path: str): - """Convert PyTorch model to TensorFlow and save. - Also detects and uses original ColBERT dimension for TF head. 
- """ - # Detect ColBERT original dimension from weights (out_dim) - try: - colbert = load_colbert_weights() - colbert_w = colbert['weight'] - out_dim = int(colbert_w.shape[0]) - print(f"Detected ColBERT dimension: {out_dim}") - colbert_dim = out_dim - return_colbert_vecs = True - except Exception as e: - print(f"ColBERT weights not found or failed to load: {e}") - colbert_dim = -1 - return_colbert_vecs = False - - # Initialize TensorFlow model with detected colbert_dim - tf_model = BGEM3TensorFlow( - model_name, - colbert_dim=colbert_dim, - return_colbert_vecs=return_colbert_vecs, - ) + """Convert PyTorch model to TensorFlow and save""" + # Initialize TensorFlow model + tf_model = BGEM3TensorFlow(model_name) # Convert weights converter = BGEM3WeightConverter(model_name) diff --git a/export_tf1_saved_model.py b/export_tf1_saved_model.py new file mode 100644 index 0000000..38cebb7 --- /dev/null +++ b/export_tf1_saved_model.py @@ -0,0 +1,369 @@ +import os +import argparse +import numpy as np +import torch +import tensorflow as tf +from transformers import AutoTokenizer +# from BGEM3TFModel_tfkeras2 import BGEM3TensorFlow + +os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "1") + +def _np(t) -> np.ndarray: + if hasattr(t, "detach"): + t = t.detach() + if hasattr(t, "cpu"): + t = t.cpu() + arr = np.array(t) + if arr.dtype != np.float32: + arr = arr.astype(np.float32) + return arr + +def load_state_dict(model_or_path: str) -> dict: + if os.path.isdir(model_or_path): + pt = os.path.join(model_or_path, "pytorch_model.bin") + else: + from huggingface_hub import snapshot_download + local = snapshot_download(repo_id=model_or_path) + pt = os.path.join(local, "pytorch_model.bin") + print(f"[load] pytorch_model.bin: {pt}") + return torch.load(pt, map_location="cpu") + +def load_colbert_weight(model_or_path: str): + try: + if os.path.isdir(model_or_path): + p = os.path.join(model_or_path, "colbert_linear.pt") + else: + from huggingface_hub import snapshot_download + local = snapshot_download(repo_id=model_or_path) + p = os.path.join(local, "colbert_linear.pt") + st = torch.load(p, map_location="cpu") + if isinstance(st, dict): + W = st.get("weight") + B = st.get("bias") + if W is None: + first_key = next(iter(st)) + W = st[first_key] + B = st.get("bias", None) + else: + W, B = (st, None) if not isinstance(st, (list, tuple)) else st + W = _np(W) + B = _np(B) if B is not None else None + print(f"[colbert] head: out_dim={W.shape[0]}, in_dim={W.shape[1]}, bias={'yes' if B is not None else 'no'}") + return W, B + except Exception as e: + print(f"[colbert] not found ({e}); skipping)") + return None, None + +def _to_tf_dtype(d) -> tf.dtypes.DType: + try: + return tf.as_dtype(d) + except Exception: + if hasattr(d, "dtype"): + try: + return tf.as_dtype(d.dtype) + except Exception: + pass + if isinstance(d, str): + return tf.as_dtype(d) + return tf.float32 + +def assign_via_feed(sess: tf.compat.v1.Session, var, value_np: np.ndarray): + """ + 그래프에 대용량 Const를 남기지 않도록 placeholder feed 기반으로 assign. + var.dtype가 문자열("float32")이어도 안전하게 동작하도록 tf.as_dtype로 강제변환. 
+ """ + # KerasVariable -> tf.Variable 강제 변환 (그래프 모드에서 안전) + var = _as_tf_variable(var) + # var.dtype이 'float32' 같은 문자열일 수 있으므로 반드시 캐스팅 + dtype = tf.as_dtype(getattr(var, "dtype", tf.float32)) + try: + base_dtype = dtype.base_dtype + except Exception: + base_dtype = dtype + + ph = tf.compat.v1.placeholder( + dtype=base_dtype, + shape=value_np.shape, + name=var.name.split(":")[0] + "_ph", + ) + # 그래프 모드 assign을 명시적으로 사용 + op = tf.compat.v1.assign(var, ph) + sess.run(op, feed_dict={ph: value_np}) + +def _force_build_dense(layer: tf.keras.layers.Dense, hidden_size: int): + dummy = tf.zeros([1, hidden_size], dtype=tf.float32) + _ = layer(dummy) + +def _debug_dump_embedding_stats(sess, tf_model): + """ + TF1 그래프 모드에서 Keras 3 변수를 안전하게 읽어서 통계를 출력. + KerasVariable -> (var.value) -> (read_value()) -> Tensor -> sess.run() + """ + def _eval(sess, var_like): + v = _as_tf_variable(var_like) + # 최종적으로 Variable/Tensor를 fetch + return sess.run(v) + + w = _eval(sess, tf_model.weight) + p = _eval(sess, tf_model.position_embeddings) + t = _eval(sess, tf_model.token_type_embeddings) + g = _eval(sess, tf_model.layerNorm.gamma) + b = _eval(sess, tf_model.layerNorm.beta) + + print(f"[check] word_emb mean={w.mean():.6f} std={w.std():.6f}") + print(f"[check] pos_emb mean={p.mean():.6f} std={p.std():.6f}") + print(f"[check] tok_emb mean={t.mean():.6f} std={t.std():.6f}") + print(f"[check] emb_LN γ mean={g.mean():.6f} std={g.std():.6f}") + print(f"[check] emb_LN β mean={b.mean():.6f} std={b.std():.6f}") + + +def _pt_style_l0(sd, input_ids_np, attention_mask_np, padding_idx=1, eps=1e-5): + """PyTorch 수식과 동일한 방식으로 임베딩+LayerNorm(L0) 계산.""" + we = _np(sd["embeddings.word_embeddings.weight"]) + pe = _np(sd["embeddings.position_embeddings.weight"]) + te = _np(sd["embeddings.token_type_embeddings.weight"]) # [type_vocab_size, H] + gamma = _np(sd["embeddings.LayerNorm.weight"]) + beta = _np(sd["embeddings.LayerNorm.bias"]) + + mask = attention_mask_np.astype(np.int32) + pos_ids = np.cumsum(mask, axis=1) * mask + padding_idx + + emb = we[input_ids_np] + pe[pos_ids] + te[0] + mean = emb.mean(axis=-1, keepdims=True) + var = ((emb - mean) ** 2).mean(axis=-1, keepdims=True) + xhat = (emb - mean) / np.sqrt(var + eps) + return xhat * gamma + beta + +def export_tf1_saved_model(model_name_or_path: str, out_root: str): + tf.keras.backend.clear_session() + tf.compat.v1.reset_default_graph() + tf.compat.v1.disable_eager_execution() + + print("[tokenizer] loading...") + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + + print("[state_dict] loading...") + sd = load_state_dict(model_name_or_path) + col_W, col_b = load_colbert_weight(model_name_or_path) + + g = tf.compat.v1.Graph() + with g.as_default(): + # Placeholders + input_ids = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_ids") + attention_mask = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="attention_mask") + input_ids_i64 = tf.compat.v1.placeholder(tf.int64, shape=[None, None], name="input_ids_i64") + attention_mask_i64 = tf.compat.v1.placeholder(tf.int64, shape=[None, None], name="attention_mask_i64") + + hidden_size = sd["embeddings.word_embeddings.weight"].shape[1] + tf_model = BGEM3TensorFlow( + vocab_size=sd["embeddings.word_embeddings.weight"].shape[0], + max_position_embeddings=sd["embeddings.position_embeddings.weight"].shape[0], + type_vocab_size=sd["embeddings.token_type_embeddings.weight"].shape[0], + hidden_size=hidden_size, + num_hidden_layers=24, num_attention_heads=16, intermediate_size=4096, + dropout_rate=0.0, 
name="bge-m3-tensorflow", + ) + + # 구현 타입 확인: 모두 TorchLayerNormTF1 이어야 함 + try: + print("[impl] embeddings LN:", type(tf_model.layerNorm)) + print("[impl] attn LN(0):", type(tf_model.encoder_layers[0].attention.attlayerNorm)) + print("[impl] ffn LN(0):", type(tf_model.encoder_layers[0].output_norm)) + except Exception: + pass + + # Build graph (variables created here) + outs32 = tf_model({"input_ids": input_ids, "attention_mask": attention_mask}, + training=False, output_hidden_states=True) + last32 = tf.identity(outs32["last_hidden_state"], name="last_hidden_state") + col32 = tf.identity(outs32["colbert_vecs"], name="colbert_vecs") + dense32 = tf.identity(last32[:, 0, :], name="dense_vecs") # raw CLS + hidd32 = tf.identity(outs32["hidden_states"], name="hidden_states") + + outs64 = tf_model({"input_ids": tf.cast(input_ids_i64, tf.int32), + "attention_mask": tf.cast(attention_mask_i64, tf.int32)}, + training=False, output_hidden_states=True) + last64 = tf.identity(outs64["last_hidden_state"], name="last_hidden_state_i64") + col64 = tf.identity(outs64["colbert_vecs"], name="colbert_vecs_i64") + dense64 = tf.identity(last64[:, 0, :], name="dense_vecs_i64") # raw CLS + hidd64 = tf.identity(outs64["hidden_states"], name="hidden_states_i64") + + # pooler 강제 빌드(검증용) + _force_build_dense(tf_model.pooler, hidden_size) + + init = tf.compat.v1.global_variables_initializer() + with tf.compat.v1.Session(graph=g) as sess: + sess.run(init) + + # === Embedding block === + assign_via_feed(sess, tf_model.weight, _np(sd["embeddings.word_embeddings.weight"])) + assign_via_feed(sess, tf_model.position_embeddings, _np(sd["embeddings.position_embeddings.weight"])) + assign_via_feed(sess, tf_model.token_type_embeddings, _np(sd["embeddings.token_type_embeddings.weight"])) + assign_via_feed(sess, tf_model.layerNorm.gamma, _np(sd["embeddings.LayerNorm.weight"])) + assign_via_feed(sess, tf_model.layerNorm.beta, _np(sd["embeddings.LayerNorm.bias"])) + + # 즉시 숫자 확인 (여기서 이상하면 레이어 0부터 틀어집니다) + _debug_dump_embedding_stats(sess, tf_model) + + # === Encoder blocks === + for i, blk in enumerate(tf_model.encoder_layers): + assign_via_feed(sess, blk.attention.wq.kernel, _np(sd[f"encoder.layer.{i}.attention.self.query.weight"]).T) + assign_via_feed(sess, blk.attention.wq.bias, _np(sd[f"encoder.layer.{i}.attention.self.query.bias"])) + assign_via_feed(sess, blk.attention.wk.kernel, _np(sd[f"encoder.layer.{i}.attention.self.key.weight"]).T) + assign_via_feed(sess, blk.attention.wk.bias, _np(sd[f"encoder.layer.{i}.attention.self.key.bias"])) + assign_via_feed(sess, blk.attention.wv.kernel, _np(sd[f"encoder.layer.{i}.attention.self.value.weight"]).T) + assign_via_feed(sess, blk.attention.wv.bias, _np(sd[f"encoder.layer.{i}.attention.self.value.bias"])) + + assign_via_feed(sess, blk.attention.dense.kernel, _np(sd[f"encoder.layer.{i}.attention.output.dense.weight"]).T) + assign_via_feed(sess, blk.attention.dense.bias, _np(sd[f"encoder.layer.{i}.attention.output.dense.bias"])) + assign_via_feed(sess, blk.attention.attlayerNorm.gamma, + _np(sd[f"encoder.layer.{i}.attention.output.LayerNorm.weight"])) + assign_via_feed(sess, blk.attention.attlayerNorm.beta, + _np(sd[f"encoder.layer.{i}.attention.output.LayerNorm.bias"])) + + assign_via_feed(sess, blk.intermediate.kernel, _np(sd[f"encoder.layer.{i}.intermediate.dense.weight"]).T) + assign_via_feed(sess, blk.intermediate.bias, _np(sd[f"encoder.layer.{i}.intermediate.dense.bias"])) + assign_via_feed(sess, blk.output_dense.kernel, _np(sd[f"encoder.layer.{i}.output.dense.weight"]).T) + 
assign_via_feed(sess, blk.output_dense.bias, _np(sd[f"encoder.layer.{i}.output.dense.bias"])) + assign_via_feed(sess, blk.output_norm.gamma, _np(sd[f"encoder.layer.{i}.output.LayerNorm.weight"])) + assign_via_feed(sess, blk.output_norm.beta, _np(sd[f"encoder.layer.{i}.output.LayerNorm.bias"])) + + if i % 4 == 0 or i == len(tf_model.encoder_layers) - 1: + print(f"[encoder {i}] weights mapped") + + # pooler (검증용, dense_vecs에는 사용 안 함) + if "pooler.dense.weight" in sd and "pooler.dense.bias" in sd: + assign_via_feed(sess, tf_model.pooler.kernel, _np(sd["pooler.dense.weight"]).T) + assign_via_feed(sess, tf_model.pooler.bias, _np(sd["pooler.dense.bias"])) + print("[pooler] initialized") + + # colbert + if col_W is not None: + assign_via_feed(sess, tf_model.colbert_linear.kernel, col_W.T) + if hasattr(tf_model.colbert_linear, "bias") and tf_model.colbert_linear.bias is not None: + if col_b is None: + col_b = np.zeros((col_W.shape[0],), dtype=np.float32) + assign_via_feed(sess, tf_model.colbert_linear.bias, col_b) + print(f"[colbert] initialized (out_dim={col_W.shape[0]}, in_dim={col_W.shape[1]})") + else: + print("[colbert] weights not found; skipping") + + # 내장 Sanity Check: L0가 PT와 붙는지 확인 (저장 이전) + try: + ids = tokenizer(["hello"], padding=True, truncation=True, max_length=8) + inp_ids = np.array(ids["input_ids"], dtype=np.int32) + att_msk = np.array(ids["attention_mask"], dtype=np.int32) + pt_l0 = _pt_style_l0(sd, inp_ids, att_msk, padding_idx=1, eps=1e-5) + tf_l0 = sess.run(hidd32[0], feed_dict={input_ids: inp_ids, attention_mask: att_msk}) + mse_l0 = np.mean((pt_l0 - tf_l0) ** 2) + print(f"[sanity] L0 MSE vs PT: {mse_l0:.8f}") + assert mse_l0 < 1e-6, "Embedding+LayerNorm (L0) mismatch; abort saving!" + except Exception as e: + raise + + # Signatures + sig_default = tf.compat.v1.saved_model.signature_def_utils.predict_signature_def( + inputs={"input_ids": input_ids, "attention_mask": attention_mask}, + outputs={"dense_vecs": dense32, "last_hidden_state": last32, "hidden_states": hidd32, + "colbert_vecs": col32}, + ) + sig_int64 = tf.compat.v1.saved_model.signature_def_utils.predict_signature_def( + inputs={"input_ids_i64": input_ids_i64, "attention_mask_i64": attention_mask_i64}, + outputs={"dense_vecs_i64": dense64, "last_hidden_state_i64": last64, "hidden_states_i64": hidd64, + "colbert_vecs_i64": col64}, + ) + + export_dir = os.path.join(out_root, "model") + os.makedirs(export_dir, exist_ok=True) + + # ★ 여기서 '모든 변수'를 모아 커스텀 Saver를 만든다 + var_list = _collect_all_variables_for_saver(tf_model) + saver = tf.compat.v1.train.Saver( + var_list=var_list, + write_version=tf.compat.v1.train.SaverDef.V2, + save_relative_paths=True, + ) + + builder = tf.compat.v1.saved_model.Builder(export_dir) + builder.add_meta_graph_and_variables( + sess, + tags=[tf.compat.v1.saved_model.tag_constants.SERVING], + signature_def_map={"serving_default": sig_default, "serving_int64": sig_int64}, + clear_devices=True, + saver=saver, # ★ 커스텀 Saver 지정 (중요) + ) + builder.save() + print(f"[export] TF1 SavedModel saved to: {export_dir}") + + tokenizer.save_pretrained(out_root) + print(f"[export] tokenizer saved to: {out_root}") + +def _as_tf_variable(v): + """KerasVariable -> tf.Variable 로 변환 (그래프 모드). 
이미 tf.Variable이면 그대로 반환.""" + try: + # 가장 안전: 이미 tf.Variable 계열이면 그대로 사용 + if isinstance(v, tf.Variable): + return v + except Exception: + pass + # Keras 3의 래퍼가 내부 변수에 접근자를 제공할 수 있음 + inner = getattr(v, "variable", None) + if isinstance(inner, tf.Variable): + return inner + inner2 = getattr(v, "_variable", None) + if isinstance(inner2, tf.Variable): + return inner2 + # 일부는 .value가 property일 수 있으나, method인 경우가 있어 호출/반환 지양 + if hasattr(v, "value") and not callable(getattr(v, "value")): + inner3 = getattr(v, "value") + if isinstance(inner3, tf.Variable): + return inner3 + return v + +def _collect_all_variables_for_saver(tf_model): + """ + Saver에 전달할 '완전한' 변수 목록을 구성. + - Keras 3의 tf_model.variables (KerasVariable) 포함 + - TF1 컬렉션의 global/trainable/model 변수 포함 + - 이름으로 dedup + """ + vars_from_keras = [_as_tf_variable(v) for v in getattr(tf_model, "variables", [])] + + vars_global = list(tf.compat.v1.global_variables()) + vars_train = list(tf.compat.v1.trainable_variables()) + try: + vars_model = list(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.MODEL_VARIABLES)) + except Exception: + vars_model = [] + + all_vars = vars_from_keras + vars_global + vars_train + vars_model + + # 이름 기준 dedup (handle/참조가 달라도 같은 변수면 이름이 동일) + dedup = [] + seen = set() + for v in all_vars: + try: + name = v.name # e.g. 'bge-m3-tensorflow/...:0' + except Exception: + continue + if name not in seen: + seen.add(name) + dedup.append(v) + + # 디버그: 저장할 변수 개수/샘플 이름 출력 + print(f"[saver] variables to save: {len(dedup)}") + for nm in list(seen)[:5]: + print(" -", nm) + + return dedup + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default="BAAI/bge-m3") + parser.add_argument("--out", type=str, default="./converted_bge_m3_tf1_java_fixed") + args = parser.parse_args() + export_tf1_saved_model(args.model, args.out) + +if __name__ == "__main__": + main() diff --git a/model_conversion_validator.py b/model_conversion_validator.py index a417943..a054ed1 100644 --- a/model_conversion_validator.py +++ b/model_conversion_validator.py @@ -1,391 +1,168 @@ -import torch +# model_conversion_validator.py import numpy as np +import torch import tensorflow as tf from transformers import AutoTokenizer, AutoModel + def load_original_pytorch_model(model_name_or_path): - """ - 원본 Hugging Face(PyTorch) 모델 및 토크나이저를 로드한 뒤, - (model, tokenizer)를 반환합니다. - """ - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - - model = AutoModel.from_pretrained(model_name_or_path) - model.eval() # 평가 모드 - return model, tokenizer - - -def encode_with_pytorch_model( - model, - tokenizer, - queries, - max_length=128, - use_cls_pooling=True, - return_hidden_states=True -): - """ - PyTorch 모델로 임베딩 추출하는 함수. - use_cls_pooling=True이면 [CLS] 임베딩 반환, - False이면 Attention Mask 기반 mean pooling을 반환. - return_hidden_states=True 이면, 모든 레이어의 히든 스테이트도 반환. 
- """ - inputs = tokenizer( - queries, - padding=True, - truncation=True, - max_length=max_length, - return_tensors='pt' - ) + tok = AutoTokenizer.from_pretrained(model_name_or_path) + mdl = AutoModel.from_pretrained(model_name_or_path) + mdl.eval() + return mdl, tok - with torch.no_grad(): - outputs = model(**inputs, output_hidden_states=return_hidden_states) - hidden_states = outputs.last_hidden_state # (batch_size, seq_len, hidden_size) +def encode_with_pytorch_model(model, tokenizer, queries, max_length=128, use_cls_pooling=True): + inputs = tokenizer(queries, padding=True, truncation=True, max_length=max_length, return_tensors="pt") + with torch.no_grad(): + outputs = model(**inputs, output_hidden_states=True) + hidden_states = outputs.last_hidden_state # [B,T,H] + all_layer_outputs = outputs.hidden_states # tuple(len=emb+24) if use_cls_pooling: - # [CLS] 벡터 사용 - embeddings = hidden_states[:, 0, :] - else: - # Mean Pooling - attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float() - sum_embeddings = torch.sum(hidden_states * attention_mask, dim=1) - sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9) - embeddings = sum_embeddings / sum_mask - - if return_hidden_states: - # outputs.hidden_states: 튜플 (embedding_layer_output + 각 Transformer 레이어 출력) - all_layer_outputs = outputs.hidden_states # tuple of torch.Tensor - return embeddings.cpu().numpy(), all_layer_outputs + emb = hidden_states[:, 0, :].cpu().numpy() else: - return embeddings.cpu().numpy() + attn = inputs["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float() + sum_embeddings = torch.sum(hidden_states * attn, dim=1) + sum_mask = torch.clamp(attn.sum(dim=1), min=1e-9) + emb = (sum_embeddings / sum_mask).cpu().numpy() + + + return emb, all_layer_outputs def show_all_layer_outputs_pytorch(all_layer_outputs, print_values=False): - """ - PyTorch 레이어별 히든 스테이트의 shape 및 (옵션) 일부 실제 값을 출력하는 유틸 함수. - """ print("\n[PyTorch] All Layer Outputs:") for i, hs in enumerate(all_layer_outputs): - print(f" Layer {i} hidden state shape: {hs.shape}") + print(f" Layer {i} hidden state shape: {tuple(hs.shape)}") if print_values: - # 첫 배치, 첫 토큰, 앞 5개 차원 - sample_vals = hs[0, 0, :5] - print(f" Sample values (batch=0, token=0, dims=0~4): {sample_vals.cpu().numpy()}") - print() - - -def load_converted_tf_model(saved_model_dir): - """ - TF SavedModel 디렉토리에서 모델을 로드하고, - 같은 경로에 있는 토크나이저를 함께 로드합니다. - - - convert_and_save_model()나 save_model_with_tokenizer()로 - "model" 폴더와 토크나이저 저장 가정. - """ - model_path = f"{saved_model_dir}/model" - loaded_model = tf.saved_model.load(model_path) - # Prefer a compatible signature if available - prefer = [ - "serving_default", - "serving_int32_3in", - "serving_int64_3in", - "serving_int32_2in", - "serving_int64_2in", - ] - sigs = loaded_model.signatures - for k in prefer: - if k in sigs: - serving_fn = sigs[k] - break - else: - raise RuntimeError("No suitable serving signature found in SavedModel.") - - tokenizer = AutoTokenizer.from_pretrained(saved_model_dir) - return serving_fn, tokenizer - - -def call_signature(sig, input_ids, attention_mask, token_type_ids=None): - """ - Call SavedModel signature with automatic key/dtype adaptation. 
- - Supplies only required keys - - Fills missing token_type_ids with zeros - - Casts inputs to signature dtypes - """ - # structured_input_signature: (args, kwargs) - spec_kwargs = sig.structured_input_signature[1] - - def prepare(name, value): - if name not in spec_kwargs: - return None - if value is None and name == "token_type_ids": - value = tf.zeros_like(input_ids) - want = spec_kwargs[name].dtype - if hasattr(value, "dtype") and value.dtype != want: - value = tf.cast(value, want) - return value - - kwargs = {} - x = prepare("input_ids", input_ids) - if x is not None: - kwargs["input_ids"] = x - x = prepare("attention_mask", attention_mask) - if x is not None: - kwargs["attention_mask"] = x - x = prepare("token_type_ids", token_type_ids) - if x is not None: - kwargs["token_type_ids"] = x - - return sig(**kwargs) + print(" sample:", hs[0, 0, :5].cpu().numpy()) -def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): - """ - TensorFlow 모델(서빙 시그니처)로 임베딩 추출하는 함수. - SavedModel은 last_hidden_state (B,T,H)만 반환하므로 CLS 풀링을 적용해 (B,H) 임베딩 생성. - """ - inputs = tokenizer( - queries, - padding=True, - truncation=True, - max_length=max_length, - return_tensors="tf" - ) +def load_converted_tf_model(saved_root_dir: str): + model_dir = f"{saved_root_dir}/model" + loaded = tf.saved_model.load(model_dir) + sig = loaded.signatures["serving_default"] + tok = AutoTokenizer.from_pretrained(saved_root_dir) + return sig, tok - token_type_ids = inputs.get("token_type_ids", tf.zeros_like(inputs["input_ids"])) - outputs = call_signature(serving_fn, inputs["input_ids"], inputs["attention_mask"], token_type_ids) - # Serving returns last_hidden_state (B, T, H); apply CLS pooling for embedding - last_hidden = outputs["last_hidden_state"] # (B, T, H) - embeddings = last_hidden[:, 0, :].numpy() # (B, H) - - return embeddings - - -def encode_with_tf_model_and_get_hidden_states(serving_fn, tokenizer, queries, max_length=128): - """ - *주의*: - - TF SavedModel에서 레이어별 히든 스테이트도 반환한다고 가정할 때 사용 가능. - - 실제 변환된 모델이 'all_hidden_states'라는 키를 노출하지 않았다면 KeyError 발생 가능. 
- """ - inputs = tokenizer( - queries, - padding=True, - truncation=True, - max_length=max_length, - return_tensors="tf" - ) - token_type_ids = inputs.get("token_type_ids", tf.zeros_like(inputs["input_ids"])) - outputs = call_signature(serving_fn, inputs["input_ids"], inputs["attention_mask"], token_type_ids) +def call_signature(sig, input_ids, attention_mask): + # 강제 int32 캐스트 + if input_ids.dtype != tf.int32: + input_ids = tf.cast(input_ids, tf.int32) + if attention_mask.dtype != tf.int32: + attention_mask = tf.cast(attention_mask, tf.int32) + return sig(input_ids=input_ids, attention_mask=attention_mask) - # Only last_hidden_state is returned in serving; keep KeyError behavior for old path - hidden_states = outputs["hidden_states"] # will raise KeyError (by design) - final_embeddings = outputs["last_hidden_state"] - if "colbert_vecs" in outputs: - print("outputs['colbert_vecs'] : ") - print(outputs["colbert_vecs"]) - else: - print("colbert_vecs not returned by TF model (flag disabled).") - return final_embeddings.numpy(), hidden_states +def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): + inputs_pt = tokenizer(queries, padding=True, truncation=True, max_length=max_length, return_tensors="pt") + inputs_tf = tokenizer(queries, padding=True, truncation=True, max_length=max_length, return_tensors="tf") + # 1) 입력 동일성 보장 (매우 중요) + assert np.array_equal(inputs_pt["input_ids"].numpy(), inputs_tf["input_ids"].numpy()), "PT/TF input_ids mismatch" + assert np.array_equal(inputs_pt["attention_mask"].numpy(), inputs_tf["attention_mask"].numpy()), "PT/TF mask mismatch" -def show_all_layer_outputs_tf(all_layer_outputs, print_values=False): - """ - TensorFlow 레이어별 히든 스테이트 shape와 (옵션) 일부 실제 값을 출력 - (가정: all_layer_outputs가 (num_layers, batch, seq_len, hidden_dim) 형태) - """ - print("\n[TensorFlow] All Layer Outputs:") - for i, hs in enumerate(all_layer_outputs): - print(f" Layer {i} hidden state shape: {hs.shape}") - if print_values: - # 첫 배치, 첫 토큰, 앞 5개 차원 - sample_vals = hs[0, 0, :5].numpy() - print(f" Sample values (batch=0, token=0, dims=0~4): {sample_vals}") - print() + outputs = serving_fn( + input_ids=tf.cast(inputs_tf["input_ids"], tf.int32), + attention_mask=tf.cast(inputs_tf["attention_mask"], tf.int32), + ) + print(f'outputs >> {outputs}') + last_hidden = outputs["last_hidden_state"] # [B,T,H] + emb = last_hidden[:, 0, :].numpy() + hiddens = outputs.get("hidden_states", None) # (L+1,B,T,H) + print(f'hiddens, {hiddens}') + return emb, (hiddens.numpy() if hiddens is not None else None) def cosine_similarity(a, b): - """ - (batch_size, hidden_dim) 형태 numpy 배열 a, b에 대해 - 벡터별 코사인 유사도(batch_size,) 반환 - """ - a_norm = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-9) - b_norm = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-9) - cos_sim = np.sum(a_norm * b_norm, axis=1) - return cos_sim + a = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-9) + b = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-9) + return np.sum(a * b, axis=1) def mse(a, b): return np.mean((a - b) ** 2) -def compare_layer_outputs(pt_all_layer_outputs, tf_all_layer_outputs): - """ - PyTorch vs. TensorFlow 레이어별로 MSE, Cosine Similarity 등을 비교해주는 함수. - - pt_all_layer_outputs: tuple of torch.Tensor (길이: num_layers_PyTorch) - (예: [embedding_output, layer1_output, layer2_output, ...]) - - tf_all_layer_outputs: tf.Tensor (shape: [num_layers_TF, batch_size, seq_len, hidden_dim]) - (예: 0번이 embedding_output, 1번이 1번 레이어, ...) 
- """ - print("\n=== Compare Layer Outputs (PyTorch vs TensorFlow) ===") - - num_pt_layers = len(pt_all_layer_outputs) - num_tf_layers = tf_all_layer_outputs.shape[0] - min_layers = min(num_pt_layers, num_tf_layers) - - - layer_names = { - 0: "Embedding Layer", - } - for i in range(1, min_layers): - layer_names[i] = f"Encoder Layer {i}" - - print("pt_all_layer_outputs", len(pt_all_layer_outputs)) - - print("tf_all_layer_outputs", len(tf_all_layer_outputs)) - - for layer_idx in range(min_layers): - pt_layer = pt_all_layer_outputs[layer_idx] # shape: [batch, seq_len, hidden_dim] - tf_layer = tf_all_layer_outputs[layer_idx] # shape: [batch, seq_len, hidden_dim] - tf_layer_np = tf_layer.numpy() +def manual_l0_from_pt(sd, input_ids_np, attention_mask_np, padding_idx=1, eps=1e-5): + we = sd["embeddings.word_embeddings.weight"].cpu().numpy().astype(np.float32) + pe = sd["embeddings.position_embeddings.weight"].cpu().numpy().astype(np.float32) + te = sd["embeddings.token_type_embeddings.weight"].cpu().numpy().astype(np.float32) + gamma = sd["embeddings.LayerNorm.weight"].cpu().numpy().astype(np.float32) + beta = sd["embeddings.LayerNorm.bias"].cpu().numpy().astype(np.float32) - print(f"\n{layer_names[layer_idx]}:") - print(f"\n{layer_names[layer_idx]}:") - print(f"PyTorch shape: {pt_layer.shape}") - print(f" dims: [batch_size={pt_layer.shape[0]}, seq_len={pt_layer.shape[1]}, hidden_dim={pt_layer.shape[2]}]") - print(f"TensorFlow shape: {tf_layer.shape}") - print(f" dims: [batch_size={tf_layer.shape[0]}, seq_len={tf_layer.shape[1]}, hidden_dim={tf_layer.shape[2]}]") + # HF와 동일: attention_mask로 포지션 ID 생성 + mask = attention_mask_np.astype(np.int32) + pos_ids = np.cumsum(mask, axis=1) * mask + padding_idx - layer_mse = mse(pt_layer.detach().cpu().numpy(), tf_layer_np) - pt_cls_vec = pt_layer[0, 0, :].detach().cpu().numpy() + emb = we[input_ids_np] + pe[pos_ids] + te[0] # type_vocab_size == 1 + mean = emb.mean(axis=-1, keepdims=True) + var = ((emb - mean) ** 2).mean(axis=-1, keepdims=True) # 모집단 분산 + xhat = (emb - mean) / np.sqrt(var + eps) + return xhat * gamma + beta # (B,T,H) - tf_cls_vec = tf_layer_np[0, 0, :] - cls_cos_sim = cosine_similarity(pt_cls_vec[np.newaxis, :], tf_cls_vec[np.newaxis, :])[0] - - print(f" -> MSE: {layer_mse:.6f}") - print(f" -> CLS Token Cosine Similarity: {cls_cos_sim:.6f}") - -# ===================== 추가한 함수: 레이어별 출력 비교 ===================== -def compare_layer_outputs1(pt_all_layer_outputs, tf_all_layer_outputs): - """ - PyTorch vs. TensorFlow 레이어별로 MSE, Cosine Similarity 등을 비교해주는 함수. - - pt_all_layer_outputs: tuple of torch.Tensor (길이: num_layers_PyTorch) - (예: [embedding_output, layer1_output, layer2_output, ...]) - - tf_all_layer_outputs: tf.Tensor (shape: [num_layers_TF, batch_size, seq_len, hidden_dim]) - (예: 0번이 embedding_output, 1번이 1번 레이어, ...) 
- """ - print("\n=== Compare Layer Outputs (PyTorch vs TensorFlow) ===") - - # PyTorch: len(pt_all_layer_outputs) = num_layers_PyTorch - # TensorFlow: tf_all_layer_outputs.shape[0] = num_layers_TF - num_pt_layers = len(pt_all_layer_outputs) - num_tf_layers = tf_all_layer_outputs.shape[0] - - # 두 모델 간 레이어 개수가 다를 수 있으므로, 비교 가능한 만큼만 비교 - min_layers = min(num_pt_layers, num_tf_layers) - - for layer_idx in range(min_layers): - pt_layer = pt_all_layer_outputs[layer_idx] # shape: [batch, seq_len, hidden_dim] - tf_layer = tf_all_layer_outputs[layer_idx] # shape: [batch, seq_len, hidden_dim] - tf_layer_np = tf_layer.numpy() - - # 일단 shape이 같은지 출력 - print(f"Layer {layer_idx}: PT {pt_layer.shape} vs TF {tf_layer.shape}") - - # MSE 계산 - layer_mse = mse(pt_layer.detach().cpu().numpy(), tf_layer_np) - # Cosine Sim: 여기서는 batch*seq_len 개 각 토큰별 벡터의 평균 코사인 유사도 등 - # 또는 첫 배치의 첫 토큰만 비교할 수도 있음 - # 여기서는 간단히 "CLS 토큰(즉 0번 token)에 대한 cos sim" 등 비교 예시 - pt_cls_vec = pt_layer[0, 0, :].detach().cpu().numpy() - tf_cls_vec = tf_layer_np[0, 0, :] - - print(pt_layer) - print(tf_layer_np) - cls_cos_sim = cosine_similarity(pt_cls_vec[np.newaxis, :], tf_cls_vec[np.newaxis, :])[0] - - print(f" -> MSE: {layer_mse:.6f}, CLS CosSim: {cls_cos_sim:.6f}") - print() - def main(): - # 경로 설정 (예: ./bge-m3, ./converted_bge_m3) - model_name_or_path = "BAAI/bge-m3" # PyTorch 원본 - saved_model_dir = "./converted_bge_m3" # TF 변환본 + pt_id = "BAAI/bge-m3" + tf_dir = "./converted_bge_m3_tf1_java_fixed" queries = [ - "이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?", + "이 모델은 무엇을 하는 모델인가요? 이 모델은 무엇을 하는 모델인가요?", "이 모델은 무엇을 하는 모델인가요?" ] - print("=== 1) PyTorch 모델 로드 및 인코딩 (레이어별 출력 포함) ===") - pt_model, pt_tokenizer = load_original_pytorch_model(model_name_or_path) - pt_embeddings, pt_all_layer_outputs = encode_with_pytorch_model( - pt_model, - pt_tokenizer, - queries, - max_length=128, - use_cls_pooling=True, - return_hidden_states=True - ) - show_all_layer_outputs_pytorch(pt_all_layer_outputs, print_values=False) - - print("=== 2) TensorFlow 모델 로드 및 인코딩 ===") - tf_serving_fn, tf_tokenizer = load_converted_tf_model(saved_model_dir) - tf_embeddings = encode_with_tf_model( - tf_serving_fn, - tf_tokenizer, - queries, - max_length=128 - ) - - # (옵션) 레이어별 출력 노출 여부 확인 - try: - tf_embeddings_with_layers, tf_all_layer_outputs = encode_with_tf_model_and_get_hidden_states( - tf_serving_fn, - tf_tokenizer, - queries, - max_length=128 - ) - show_all_layer_outputs_tf(tf_all_layer_outputs, print_values=False) - - # [추가] 레이어별로 직접 비교 - compare_layer_outputs(pt_all_layer_outputs, tf_all_layer_outputs) - - print("[TensorFlow] Final Embeddings Shape:", tf_embeddings_with_layers.shape) - except KeyError: - print("TensorFlow 서빙 시그니처에 hidden_states가 없습니다. (기본 TF 변환본일 가능성)") - - print("\n=== 3) PT vs. 
TF 최종 임베딩 비교 ===") - - print(pt_embeddings) - print(tf_embeddings) - - cos_sims = cosine_similarity(pt_embeddings, tf_embeddings) - - errors = (pt_embeddings - tf_embeddings) - mse_val = mse(pt_embeddings, tf_embeddings) - - print("===== Queries =====") - for i, q in enumerate(queries): - print(f"[{i}] {q}") - print() - - print("===== PyTorch Embeddings (shape) =====") - print(pt_embeddings.shape) - print("===== TF Embeddings (shape) =====") - print(tf_embeddings.shape) - - print("\n===== Pairwise Cosine Similarity (PT vs TF) =====") - for i, cs in enumerate(cos_sims): - print(f"Query {i} Cosine Similarity: {cs:.4f}") - - print(f"\n===== MSE (PT vs TF) =====") - print(f"MSE: {mse_val:.6f}") - - print("\n===== Sample Differences (first query, first 5 dims) =====") - print(errors[0][:5]) + print("=== 1) PyTorch ===") + pt_model, pt_tok = load_original_pytorch_model(pt_id) + pt_emb, pt_layers = encode_with_pytorch_model(pt_model, pt_tok, queries, max_length=128) + show_all_layer_outputs_pytorch(pt_layers) + + print("=== 2) TensorFlow ===") + tf_sig, tf_tok = load_converted_tf_model(tf_dir) + tf_emb, tf_layers = encode_with_tf_model(tf_sig, tf_tok, queries, max_length=128) + + pt_l0 = pt_layers[0].detach().cpu().numpy() # (B,T,H) + tf_l0 = tf_layers[0] # (B,T,H) + print("L0 CLS head(PT)[:8]:", pt_l0[0, 0, :8]) + print("L0 CLS head(TF)[:8]:", tf_l0[0, 0, :8]) + + print("\n=== 3) Compare ===") + print("PT shape:", pt_emb.shape, "TF shape:", tf_emb.shape) + cs = cosine_similarity(pt_emb, tf_emb) + print("Cosine:", ["%.4f" % c for c in cs]) + print("MSE:", float(mse(pt_emb, tf_emb))) + + # 선택: 레이어별 비교 (있을 때만) + print(f'tf_layers, {tf_layers}') + if tf_layers is not None: + print("\n[Layer-wise] Cosine (PT vs TF):") + # pt_layers: tuple(len=L+1), tf_layers: (L+1,B,T,H) + tf_layers_np = tf_layers # (L+1,B,T,H) + for i in range(len(pt_layers)): + pt_l = pt_layers[i].detach().cpu().numpy() + tf_l = tf_layers_np[i] + c = cosine_similarity(pt_l[:, 0, :], tf_l[:, 0, :]) # CLS만 비교 + e = mse(pt_l, tf_l) + print(f" Layer {i:02d} cos={c.mean():.4f} mse={e:.6f}") + + from transformers import AutoModel + pt_model = AutoModel.from_pretrained("BAAI/bge-m3") + sd = pt_model.state_dict() + inputs_pt = pt_tok(queries, padding=True, truncation=True, max_length=128, return_tensors="pt") + + l0_manual = manual_l0_from_pt(sd, + inputs_pt["input_ids"].numpy(), + inputs_pt["attention_mask"].numpy(), + padding_idx=1, + eps=float(pt_model.config.layer_norm_eps)) + + pt_l0 = pt_layers[0].detach().cpu().numpy() + tf_l0 = tf_layers[0] + + print("Manual vs PT MSE:", np.mean((l0_manual - pt_l0) ** 2)) + print("Manual vs TF MSE:", np.mean((l0_manual - tf_l0) ** 2)) if __name__ == "__main__": diff --git a/tf1_session_validator.py b/tf1_session_validator.py new file mode 100644 index 0000000..b66c0fc --- /dev/null +++ b/tf1_session_validator.py @@ -0,0 +1,39 @@ +# tf1_session_validator.py +import argparse +import numpy as np +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +from tensorflow.python.saved_model import loader, tag_constants + +SIG = "serving_default" + +def inspect_and_run(model_dir: str, b=2, t=12): + print(f"[inspect] {model_dir}") + g = tf.Graph() + with g.as_default(): + with tf.compat.v1.Session(graph=g) as sess: + meta = loader.load(sess, [tag_constants.SERVING], model_dir) + sig = meta.signature_def[SIG] + + t_ids = g.get_tensor_by_name(sig.inputs["input_ids"].name) + t_msk = g.get_tensor_by_name(sig.inputs["attention_mask"].name) + t_last = 
g.get_tensor_by_name(sig.outputs["last_hidden_state"].name) + t_colb = g.get_tensor_by_name(sig.outputs["colbert_vecs"].name) + + print(" - inputs :", sig.inputs) + print(" - outputs:", sig.outputs) + + ids = np.random.randint(10, 1000, size=(b, t)).astype(np.int32) + msk = np.ones((b, t), dtype=np.int32) + + last, colb = sess.run([t_last, t_colb], feed_dict={t_ids: ids, t_msk: msk}) + print("last_hidden_state:", last.shape, last.dtype) + print("colbert_vecs :", colb.shape, colb.dtype) + print("✔ TF1 Session run OK") + +if __name__ == "__main__": + ap = argparse.ArgumentParser() + ap.add_argument("--dir", type=str, required=False, default="./converted_bge_m3_tf1_v1") + args = ap.parse_args() + inspect_and_run(args.dir) diff --git a/tf1_validator.py b/tf1_validator.py new file mode 100644 index 0000000..95bff8d --- /dev/null +++ b/tf1_validator.py @@ -0,0 +1,165 @@ +# torch_tf_validator.py +import argparse +import os +import numpy as np +import tensorflow as tf +import traceback + +# ========================================================================= +# 1. TF1 환경 설정 및 상수 +# ========================================================================= + +# TF1 환경 보장 (TF2 환경에서 실행 시 Eager Execution 비활성화) +try: + tf.compat.v1.disable_v2_behavior() + tf.compat.v1.disable_eager_execution() + print("INFO: Running in TF1 compatibility mode.") + tf1 = tf.compat.v1 +except Exception as e: + print("INFO: Running in native TF1 mode (or V2 behavior already disabled).") + tf1 = tf + +# Constants for TF1 SavedModel loading +try: + SAVED_MODEL_TAG = tf1.saved_model.tag_constants.SERVING # "serve" +except AttributeError: + # TF1 버전이 매우 낮을 경우 대비 + SAVED_MODEL_TAG = "serve" + +SIGNATURE_KEY = "serving_default" + + +# ========================================================================= +# 2. 검증 실행 함수 (격리된 그래프 사용) +# ========================================================================= + +# ★★★ 함수명이 run_validation으로 변경되었습니다. (이전: run_session_once) ★★★ +def run_validation(model_dir: str, dtype=np.int32, batch=2, seqlen=12): + print(f"[1] SavedModel 로드 및 시그니처 점검: {model_dir}") + + # ★★★★★★★★★★★ 핵심 수정 사항: 격리된 그래프 생성 ★★★★★★★★★★★ + # FailedPreconditionError의 원인인 이름 충돌(예: bge_m3_tensorflow_1)을 방지하기 위해, + # 모델을 기본(Default) 그래프가 아닌, 완전히 격리된 새 그래프로 로드합니다. + graph = tf.Graph() + + with graph.as_default(): + # 이 깨끗한 그래프와 연결된 세션을 생성합니다. + config = tf1.ConfigProto() + + # ★★★ 세션에 명시적으로 그래프 연결 ★★★ + with tf1.Session(graph=graph, config=config) as sess: + + # --- 1단계: 모델 로드 --- + print(f"Loading model into isolated graph...") + try: + # 모델을 깨끗한 그래프(graph)와 세션(sess)으로 로드 (TF1 loader 사용) + # WARNING 메시지는 무시해도 됩니다 (TF2 환경에서 TF1 loader 사용 시 발생) + meta_graph_def = tf1.saved_model.loader.load( + sess, + [SAVED_MODEL_TAG], + model_dir + ) + except Exception as e: + print(f"ERROR: Failed to load SavedModel from {model_dir}. 
Error: {e}") + return + + # --- 2단계: 시그니처 점검 --- + if SIGNATURE_KEY not in meta_graph_def.signature_def: + print(f"ERROR: Signature '{SIGNATURE_KEY}' not found.") + return + + signature_def = meta_graph_def.signature_def[SIGNATURE_KEY] + print(f" - 사용 시그니처: {SIGNATURE_KEY}") + + # Shape 출력을 위한 헬퍼 함수 (TF1 방식) + def format_shape(tensor_info): + try: + # TensorShapeProto에서 shape 추출 + return [d.size for d in tensor_info.tensor_shape.dim] + except: + return "Unknown" + + print(" - 입력들:") + for key, tensor_info in signature_def.inputs.items(): + print( + f" • key='{key}', dtype={tf.dtypes.as_dtype(tensor_info.dtype).name}, shape={format_shape(tensor_info)}, name='{tensor_info.name}'") + + print(" - 출력들:") + for key, tensor_info in signature_def.outputs.items(): + print( + f" • key='{key}', dtype={tf.dtypes.as_dtype(tensor_info.dtype).name}, shape={format_shape(tensor_info)}, name='{tensor_info.name}'") + + # --- 3단계: 추론 테스트 실행 --- + print(f"\n[2] TF1 세션으로 1회 추론 실행 (입력 dtype={dtype.__name__}, B={batch}, T={seqlen})") + + # 더미 입력 데이터 준비 (int32 요구) + input_ids_data = np.random.randint(100, 10000, size=(batch, seqlen)).astype(dtype) + attention_mask_data = np.ones((batch, seqlen)).astype(dtype) + + # 시그니처에서 입출력 텐서 이름 식별 + try: + input_ids_tname = signature_def.inputs['input_ids'].name + attention_mask_tname = signature_def.inputs['attention_mask'].name + last_h_tname = signature_def.outputs['last_hidden_state'].name + colbert_tname = signature_def.outputs['colbert_vecs'].name + except KeyError as e: + print(f"ERROR: Expected tensor key not found in signature: {e}") + return + + feed_dict = { + input_ids_tname: input_ids_data, + attention_mask_tname: attention_mask_data, + } + + fetches = [last_h_tname, colbert_tname] + + # 추론 실행 + try: + print("Running session...") + # ★★★ 그래프가 격리되었으므로 성공해야 합니다. ★★★ + last_h, colbert = sess.run(fetches, feed_dict=feed_dict) + + print("\n[SUCCESS] Inference successful!") + print(f" - last_hidden_state shape: {last_h.shape}, dtype: {last_h.dtype}") + print(f" - colbert_vecs shape: {colbert.shape}, dtype: {colbert.dtype}") + + except Exception as e: + print(f"\n[FAILURE] ERROR during inference: {type(e).__name__}") + if "FailedPreconditionError" in str(type(e)): + print("FailedPreconditionError가 여전히 발생했습니다.") + print("이는 모델 변환 과정(BGEM3WeightConverter.py)에서 이미 이름 불일치가 발생하여 저장되었음을 의미합니다.") + print("해결 방법: 모델 폴더(converted_bge_m3_tf1safe)를 삭제하고, 완전히 새로운 터미널에서 변환 스크립트를 다시 실행 후 검증하세요.") + traceback.print_exc() + + +# ========================================================================= +# 3. 실행 로직 +# ========================================================================= + +def main(): + default_model_dir = "./converted_bge_m3_tf1safe" + + parser = argparse.ArgumentParser(description="Validate converted BGE-M3 TensorFlow SavedModel in TF1 environment.") + parser.add_argument("--model_dir", type=str, default=default_model_dir, + help="Path to the SavedModel directory (e.g., converted_bge_m3_tf1safe)") + args = parser.parse_args() + + # 경로 확인 로직 (model 하위 폴더 자동 탐색) + model_path = args.model_dir + + # 1. 지정된 경로 확인 + if os.path.exists(os.path.join(model_path, "saved_model.pb")): + pass # 경로 정상 + # 2. 
하위 'model' 폴더 확인 + elif os.path.exists(os.path.join(model_path, "model", "saved_model.pb")): + model_path = os.path.join(model_path, "model") + else: + print(f"Error: saved_model.pb not found in {args.model_dir} or {os.path.join(args.model_dir, 'model')}.") + return + + # ★★★ 수정된 함수 호출 ★★★ + run_validation(model_path, dtype=np.int32, batch=2, seqlen=12) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tf_colbert_loader.py b/tf_colbert_loader.py index 8005891..7625cb6 100644 --- a/tf_colbert_loader.py +++ b/tf_colbert_loader.py @@ -43,7 +43,7 @@ def call_signature(sig, input_ids, attention_mask, token_type_ids=None): def main(): parser = argparse.ArgumentParser(description="Load TF ColBERT SavedModel and print output shapes.") - parser.add_argument("--model_dir", default="./converted_bge_m3", help="Path to SavedModel root (contains tokenizer files).") + parser.add_argument("--model_dir", default="./converted_bge_m3_tf1_java", help="Path to SavedModel root (contains tokenizer files).") parser.add_argument("--max_length", type=int, default=128, help="Tokenization max length.") parser.add_argument("--texts", nargs="*", default=[ "이 모델은 무엇을 하는 모델인가요?", diff --git a/torch_tf_validator.py b/torch_tf_validator.py index b745ab4..1e39666 100644 --- a/torch_tf_validator.py +++ b/torch_tf_validator.py @@ -1,127 +1,81 @@ -import loguru - -from transformers import AutoTokenizer, AutoModel -import tensorflow as tf -import torch - -def load_torch_model(model_path): - model = AutoModel.from_pretrained(model_path) - return model - - -def load_tf_model(model_path): - with tf.device("/CPU:0"): - model = tf.saved_model.load(model_path) - return model - - -def load_tokenizer(model_path): - tokenizer = AutoTokenizer.from_pretrained(model_path) - return tokenizer - - -def tokenize_wo_padding(tokenizer, text, return_tensors="pt"): - return tokenizer(text, padding=False, return_tensors=return_tensors) - - -def tokenize_w_padding(tokenizer, text, return_tensors="pt", max_length=512): - return tokenizer(text, padding="max_length", max_length=max_length, return_tensors=return_tensors) - - -def call_signature(sig, input_ids, attention_mask, token_type_ids=None): - """Adapt arguments to a SavedModel signature: keys and dtypes.""" - spec_kwargs = sig.structured_input_signature[1] - - def prepare(name, value): - if name not in spec_kwargs: - return None - if value is None and name == "token_type_ids": - value = tf.zeros_like(input_ids) - want = spec_kwargs[name].dtype - if hasattr(value, "dtype") and value.dtype != want: - value = tf.cast(value, want) - return value - - kwargs = {} - x = prepare("input_ids", input_ids) - if x is not None: - kwargs["input_ids"] = x - x = prepare("attention_mask", attention_mask) - if x is not None: - kwargs["attention_mask"] = x - x = prepare("token_type_ids", token_type_ids) - if x is not None: - kwargs["token_type_ids"] = x - return sig(**kwargs) - +# torch_tf_validator.py +import argparse +import numpy as np +import tensorflow as tf + +tf.compat.v1.disable_eager_execution() + +TAG_SERVE = tf.saved_model.SERVING +SIG_NAME = "serving_default" + +def _tensor_shape_to_list(tensor_shape_proto): + dims = tensor_shape_proto.dim + if not dims: + return None + return [d.size for d in dims] + +def inspect_signature(model_dir: str): + print(f"\n[1] SavedModel 로드 및 시그니처 점검: {model_dir}") + g = tf.Graph() + with g.as_default(): + with tf.compat.v1.Session(graph=g) as sess: + meta_graph_def = tf.compat.v1.saved_model.load(sess, [TAG_SERVE], model_dir) + sigs = 
meta_graph_def.signature_def + if SIG_NAME not in sigs: + raise RuntimeError(f"Signature '{SIG_NAME}' not found. Available: {list(sigs.keys())}") + sig = sigs[SIG_NAME] + print(f" - 사용 시그니처: {SIG_NAME}") + print(" - 입력들:") + for k, tinfo in sig.inputs.items(): + print(f" • key='{k}', dtype={tf.DType(tinfo.dtype).name}, " + f"shape={_tensor_shape_to_list(tinfo.tensor_shape)}, name='{tinfo.name}'") + print(" - 출력들:") + for k, tinfo in sig.outputs.items(): + print(f" • key='{k}', dtype={tf.DType(tinfo.dtype).name}, " + f"shape={_tensor_shape_to_list(tinfo.tensor_shape)}, name='{tinfo.name}'") + +def run_session_once(model_dir: str, dtype=np.int32, batch=2, seqlen=12): + print(f"\n[2] TF1 세션으로 1회 추론 실행 (입력 dtype={np.dtype(dtype).name}, B={batch}, T={seqlen})") + g = tf.Graph() + with g.as_default(): + with tf.compat.v1.Session(graph=g) as sess: + meta_graph_def = tf.compat.v1.saved_model.load(sess, [TAG_SERVE], model_dir) + sig = meta_graph_def.signature_def[SIG_NAME] + + t_input_ids = g.get_tensor_by_name(sig.inputs["input_ids"].name) + t_attention = g.get_tensor_by_name(sig.inputs["attention_mask"].name) + t_last = g.get_tensor_by_name(sig.outputs["last_hidden_state"].name) + t_colbert = g.get_tensor_by_name(sig.outputs["colbert_vecs"].name) + + input_ids = np.random.randint(10, 1000, size=(batch, seqlen)).astype(dtype) + attention_mask = np.ones((batch, seqlen), dtype=dtype) + + last_h, colbert = sess.run( + [t_last, t_colbert], + feed_dict={t_input_ids: input_ids, t_attention: attention_mask} + ) + print(" - last_hidden_state:", last_h.shape, last_h.dtype) + print(" - colbert_vecs :", colbert.shape, colbert.dtype) + assert last_h.dtype == np.float32 and colbert.dtype == np.float32 + assert last_h.shape == (batch, seqlen, 1024) + print(colbert.shape) + #assert colbert.shape == (batch, seqlen, 1024) + print(" ✔ 세션 추론 성공") def main(): - # Load the model - model_path = "BAAI/bge-m3" - model_path_tf = "/workspace/BGE-M3-Model-Converter/model" - model = load_torch_model(model_path) - tokenizer = load_tokenizer(model_path) - - # Tokenize the text - text = "Hello, my dog is cute" - inputs = tokenize_wo_padding(tokenizer, text) - inputs_w_padding = tokenize_w_padding(tokenizer, text) - - # Get the output from the model - loguru.logger.info("Torch] Model output".ljust(50, "-")) - model.eval().to("cuda") - with torch.no_grad(): - inputs = {k: v.to("cuda") for k, v in inputs.items()} - inputs_w_padding = {k: v.to("cuda") for k, v in inputs_w_padding.items()} - - output = model(**inputs) - output_w_padding = model(**inputs_w_padding) - loguru.logger.info("output without padding (GT)".ljust(50, "-")) - loguru.logger.info(output['last_hidden_state'][:, 0]) - loguru.logger.info("="*50) - loguru.logger.info("output with padding".ljust(50, "-")) - loguru.logger.info(output_w_padding['last_hidden_state'][:, 0]) - loguru.logger.info("="*50) - err = torch.abs(output['last_hidden_state'][:, 0] - output_w_padding['last_hidden_state'][:, 0]) - loguru.logger.info("Error".ljust(50, "-")) - loguru.logger.info(err.mean()) - - inputs_tf = tokenize_wo_padding(tokenizer, text, return_tensors="tf") - inputs_tf_w_padding = tokenize_w_padding(tokenizer, text, return_tensors="tf") - loaded = load_tf_model(model_path_tf) - # Use the default 2-input signature - sigs = loaded.signatures - tf_model = sigs.get("serving_default") - if tf_model is None: - raise RuntimeError("serving_default signature not found") - - loguru.logger.info("Tensorflow] Model output".ljust(50, "-")) - with tf.device("/GPU:0"): - output_tf = 
call_signature( - tf_model, inputs_tf["input_ids"], inputs_tf["attention_mask"], None - ) - output_tf_w_padding = call_signature( - tf_model, - inputs_tf_w_padding["input_ids"], - inputs_tf_w_padding["attention_mask"], - None, - ) - loguru.logger.info("output without padding (GT)".ljust(50, "-")) - hs = output_tf['last_hidden_state'] - val_no_pad = hs[:, 0] - loguru.logger.info(val_no_pad) - loguru.logger.info("="*50) - loguru.logger.info("output with padding".ljust(50, "-")) - hsw = output_tf_w_padding['last_hidden_state'] - val_pad = hsw[:, 0] - loguru.logger.info(val_pad) - loguru.logger.info("="*50) - err_tf = tf.abs(val_no_pad - val_pad) - loguru.logger.info("Error".ljust(50, "-")) - loguru.logger.info(tf.reduce_mean(err_tf)) - loguru.logger.info("="*50) - - + ap = argparse.ArgumentParser() + ap.add_argument("--model_dir", type=str, default="converted_bge_m3_tf1_java_fixed/model", help="Path to SavedModel directory (…/model)") + args = ap.parse_args() + + inspect_signature(args.model_dir) + run_session_once(args.model_dir, dtype=np.int32, batch=2, seqlen=12) + + print("\n[3] (의도적) int64로 재실행 → 보통 dtype mismatch로 실패하거나 내부 캐스팅 없으면 에러") + try: + run_session_once(args.model_dir, dtype=np.int64, batch=2, seqlen=12) + print("※ int64 입력이 통과하면, 시그니처가 int64이거나 내부 캐스팅이 있는 경우입니다.") + except Exception as e: + print(" ✔ 기대된 실패(입력 dtype 불일치):", type(e).__name__, str(e)[:200], "…") if __name__ == "__main__": main() From c8588163bd98955972cbc871cc0551173fe51651 Mon Sep 17 00:00:00 2001 From: sigridjineth Date: Sun, 7 Sep 2025 18:41:58 +0900 Subject: [PATCH 3/3] validate colbert --- export_tf1_saved_model.py | 90 ++++++++++++++++------- model_conversion_validator.py | 133 +++++++++++++++++++++++++++++++++- 2 files changed, 196 insertions(+), 27 deletions(-) diff --git a/export_tf1_saved_model.py b/export_tf1_saved_model.py index 38cebb7..5f06da1 100644 --- a/export_tf1_saved_model.py +++ b/export_tf1_saved_model.py @@ -4,7 +4,8 @@ import torch import tensorflow as tf from transformers import AutoTokenizer -# from BGEM3TFModel_tfkeras2 import BGEM3TensorFlow +from BGEM3TFModel_tfkeras2 import BGEM3TensorFlow +from huggingface_hub import snapshot_download # NEW os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "1") @@ -28,31 +29,70 @@ def load_state_dict(model_or_path: str) -> dict: print(f"[load] pytorch_model.bin: {pt}") return torch.load(pt, map_location="cpu") -def load_colbert_weight(model_or_path: str): - try: - if os.path.isdir(model_or_path): - p = os.path.join(model_or_path, "colbert_linear.pt") - else: - from huggingface_hub import snapshot_download - local = snapshot_download(repo_id=model_or_path) - p = os.path.join(local, "colbert_linear.pt") - st = torch.load(p, map_location="cpu") - if isinstance(st, dict): - W = st.get("weight") - B = st.get("bias") - if W is None: - first_key = next(iter(st)) - W = st[first_key] - B = st.get("bias", None) +def load_colbert_weight(model_name_or_path: str): + """ + Hugging Face repo(또는 로컬 폴더)에서 colbert_linear.pt를 읽어 weight/bias를 numpy(float32)로 반환 + weight: (out_dim, in_dim), bias: (out_dim,) + """ + if os.path.isdir(model_name_or_path): + p = os.path.join(model_name_or_path, "colbert_linear.pt") + else: + local = snapshot_download(repo_id=model_name_or_path) + p = os.path.join(local, "colbert_linear.pt") + + st = torch.load(p, map_location="cpu") + if isinstance(st, dict): + W = st.get("weight") + B = st.get("bias") + if W is None: # 혹시 키 이름이 다르면 첫 텐서를 weight로 간주 + first_key = next(iter(st)) + W = st[first_key] + B = st.get("bias", None) + else: + # 리스트/튜플 
또는 텐서 + if isinstance(st, (list, tuple)): + W, B = st else: - W, B = (st, None) if not isinstance(st, (list, tuple)) else st - W = _np(W) - B = _np(B) if B is not None else None - print(f"[colbert] head: out_dim={W.shape[0]}, in_dim={W.shape[1]}, bias={'yes' if B is not None else 'no'}") - return W, B - except Exception as e: - print(f"[colbert] not found ({e}); skipping)") - return None, None + W, B = st, None + + W = W.detach().cpu().numpy().astype(np.float32) + B = ( + B.detach().cpu().numpy().astype(np.float32) + if B is not None + else np.zeros((W.shape[0],), np.float32) + ) + print(f"[colbert] head: out_dim={W.shape[0]}, in_dim={W.shape[1]}, bias={'yes' if B is not None else 'no'}") + return W, B + + +def project_colbert_pt( + last_hidden_np: np.ndarray, + attention_mask_np: np.ndarray, + W: np.ndarray, + b: np.ndarray, +) -> np.ndarray: + """ + PT last_hidden_state로부터 ColBERT head 적용 결과를 계산 + last_hidden_np: (B,T,H) / attention_mask_np: (B,T) / W: (O,H) / b: (O,) + 반환: masked colbert vecs, shape (B,T-1,O) + """ + # 1) CLS 제외 + x = last_hidden_np[:, 1:, :] # (B, T-1, H) + # 2) 선형 사상: x @ W^T + b, einsum 'bth,oh->bto' + y = np.einsum('bth,oh->bto', x, W) + b[None, None, :] + # 3) 마스킹 + submask = attention_mask_np[:, 1:].astype(np.float32) # (B, T-1) + y = y * submask[:, :, None] + return y + + +def cosine_rowwise(a: np.ndarray, b: np.ndarray, eps: float = 1e-9) -> np.ndarray: + """ + 마지막 축(특징 축) 기준 코사인 유사도. a,b: (...,D) -> (...) + """ + a_n = a / (np.linalg.norm(a, axis=-1, keepdims=True) + eps) + b_n = b / (np.linalg.norm(b, axis=-1, keepdims=True) + eps) + return np.sum(a_n * b_n, axis=-1) def _to_tf_dtype(d) -> tf.dtypes.DType: try: diff --git a/model_conversion_validator.py b/model_conversion_validator.py index a054ed1..f07ff88 100644 --- a/model_conversion_validator.py +++ b/model_conversion_validator.py @@ -3,6 +3,7 @@ import torch import tensorflow as tf from transformers import AutoTokenizer, AutoModel +from huggingface_hub import snapshot_download def load_original_pytorch_model(model_name_or_path): @@ -72,7 +73,8 @@ def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): emb = last_hidden[:, 0, :].numpy() hiddens = outputs.get("hidden_states", None) # (L+1,B,T,H) print(f'hiddens, {hiddens}') - return emb, (hiddens.numpy() if hiddens is not None else None) + colbert = outputs.get("colbert_vecs", None) + return emb, (hiddens.numpy() if hiddens is not None else None), (colbert.numpy() if colbert is not None else None) def cosine_similarity(a, b): @@ -103,6 +105,64 @@ def manual_l0_from_pt(sd, input_ids_np, attention_mask_np, padding_idx=1, eps=1e return xhat * gamma + beta # (B,T,H) +# === ColBERT helpers === +def load_colbert_weight(model_name_or_path: str): + """ + Load colbert_linear.pt from local path or HF repo and return W, b as float32 numpy arrays. 
+ W: (out_dim, in_dim), b: (out_dim,) + """ + import os + if os.path.isdir(model_name_or_path): + p = os.path.join(model_name_or_path, "colbert_linear.pt") + else: + local = snapshot_download(repo_id=model_name_or_path) + p = os.path.join(local, "colbert_linear.pt") + + st = torch.load(p, map_location="cpu") + if isinstance(st, dict): + W = st.get("weight") + B = st.get("bias") + if W is None: + first_key = next(iter(st)) + W = st[first_key] + B = st.get("bias", None) + else: + if isinstance(st, (list, tuple)): + W, B = st + else: + W, B = st, None + + W = W.detach().cpu().numpy().astype(np.float32) + B = ( + B.detach().cpu().numpy().astype(np.float32) + if B is not None + else np.zeros((W.shape[0],), np.float32) + ) + return W, B + + +def project_colbert_pt(last_hidden_np: np.ndarray, + attention_mask_np: np.ndarray, + W: np.ndarray, + b: np.ndarray) -> np.ndarray: + """ + Apply ColBERT head on PT last_hidden_state and mask out padding. + last_hidden_np: (B,T,H), attention_mask_np: (B,T), W: (O,H), b: (O,) -> returns (B,T-1,O) + """ + x = last_hidden_np[:, 1:, :] # remove CLS + y = np.einsum('bth,oh->bto', x, W) + b[None, None, :] + submask = attention_mask_np[:, 1:].astype(np.float32) + y = y * submask[:, :, None] + return y + + +def cosine_rowwise(a: np.ndarray, b: np.ndarray, eps: float = 1e-9) -> np.ndarray: + """Cosine similarity along the last dimension.""" + a_n = a / (np.linalg.norm(a, axis=-1, keepdims=True) + eps) + b_n = b / (np.linalg.norm(b, axis=-1, keepdims=True) + eps) + return np.sum(a_n * b_n, axis=-1) + + def main(): @@ -121,7 +181,7 @@ def main(): print("=== 2) TensorFlow ===") tf_sig, tf_tok = load_converted_tf_model(tf_dir) - tf_emb, tf_layers = encode_with_tf_model(tf_sig, tf_tok, queries, max_length=128) + tf_emb, tf_layers, tf_colbert = encode_with_tf_model(tf_sig, tf_tok, queries, max_length=128) pt_l0 = pt_layers[0].detach().cpu().numpy() # (B,T,H) tf_l0 = tf_layers[0] # (B,T,H) @@ -164,6 +224,75 @@ def main(): print("Manual vs PT MSE:", np.mean((l0_manual - pt_l0) ** 2)) print("Manual vs TF MSE:", np.mean((l0_manual - tf_l0) ** 2)) + # === 4) ColBERT head validation (masked) === + try: + Wc, bc = load_colbert_weight(pt_id) + batch_pt = pt_tok(queries, padding=True, truncation=True, max_length=128, return_tensors="pt") + inputs_mask = batch_pt["attention_mask"].numpy().astype(np.int32) # (B,T) + + # PT 측 colbert 투영 + pt_last_hidden_np = pt_layers[-1].detach().cpu().numpy() # (B,T,H) + pt_colbert = project_colbert_pt(pt_last_hidden_np, inputs_mask, Wc, bc) # (B,T-1,O) + + if tf_colbert is None: + print("[ColBERT] TF colbert_vecs not present in signature; skipped") + else: + # 시간축 동기화 (이론상 T-1 동일) + min_T = min(pt_colbert.shape[1], tf_colbert.shape[1]) + ptc = pt_colbert[:, :min_T, :] + tfc = tf_colbert[:, :min_T, :] + + # 유효토큰 마스크 (CLS 제외) + valid_mask = (inputs_mask[:, 1:][:, :min_T] == 1) # (B, min_T) + # 평탄화 후 유효토큰만 선택 + pt_flat = ptc.reshape(-1, ptc.shape[-1])[valid_mask.reshape(-1)] + tf_flat = tfc.reshape(-1, tfc.shape[-1])[valid_mask.reshape(-1)] + + # 영벡터 제거(정규화시 왜곡 방지) + keep = (np.linalg.norm(pt_flat, axis=1) > 1e-12) | (np.linalg.norm(tf_flat, axis=1) > 1e-12) + pt_flat = pt_flat[keep] + tf_flat = tf_flat[keep] + + col_mse_valid = mse(pt_flat, tf_flat) + col_cos_valid = cosine_rowwise(pt_flat, tf_flat).mean() + print(f"\n[ColBERT(valid)] mse={col_mse_valid:.8f} cos={col_cos_valid:.6f}") + + # 참고: 모든 위치(패딩 포함) 지표도 함께 출력 + col_mse_all = mse(ptc, tfc) + col_cos_all = cosine_rowwise( + ptc.reshape(-1, ptc.shape[-1]), tfc.reshape(-1, tfc.shape[-1]) + 
).mean() + print(f"[ColBERT(all-pos)] mse={col_mse_all:.8f} cos={col_cos_all:.6f}") + + # === 5) last_hidden_state (masked) 비교 === + pt_last = pt_layers[-1].detach().cpu().numpy() # (B,T,H) + tf_last = tf_layers[-1] # (B,T,H) + min_T2 = min(pt_last.shape[1], tf_last.shape[1]) + pt_last = pt_last[:, :min_T2, :] + tf_last = tf_last[:, :min_T2, :] + valid_mask2 = (inputs_mask[:, :min_T2] == 1) + pt_flat2 = pt_last.reshape(-1, pt_last.shape[-1])[valid_mask2.reshape(-1)] + tf_flat2 = tf_last.reshape(-1, tf_last.shape[-1])[valid_mask2.reshape(-1)] + mse_last = mse(pt_flat2, tf_flat2) + cos_last = cosine_rowwise(pt_flat2, tf_flat2).mean() + print(f"[last_hidden(valid)] mse={mse_last:.8f} cos={cos_last:.6f}") + + # === 6) Determinism check === + batch_tf = tf_tok(queries, padding=True, truncation=True, max_length=128, return_tensors="tf") + outs1 = tf_sig( + input_ids=tf.cast(batch_tf["input_ids"], tf.int32), + attention_mask=tf.cast(batch_tf["attention_mask"], tf.int32), + ) + outs2 = tf_sig( + input_ids=tf.cast(batch_tf["input_ids"], tf.int32), + attention_mask=tf.cast(batch_tf["attention_mask"], tf.int32), + ) + lh1 = outs1["last_hidden_state"].numpy() + lh2 = outs2["last_hidden_state"].numpy() + print("[determinism] max_abs_diff:", float(np.max(np.abs(lh1 - lh2)))) + except Exception as e: + print(f"[ColBERT] skipped: {e}") + if __name__ == "__main__": main()
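
Weight-layout note. The converter maps every PyTorch nn.Linear weight of shape (out_dim, in_dim) onto a Keras Dense kernel of shape (in_dim, out_dim), which is why assign_via_feed receives _np(sd[...]).T throughout the encoder loop and for the ColBERT head. A minimal standalone sketch of that convention, using illustrative sizes (in_dim=8, out_dim=4) and random data rather than anything from the checkpoint; run it in ordinary eager/TF2 mode:

    # Sketch only: confirms the (out, in) -> (in, out) transpose used when
    # assigning PyTorch Linear weights to Keras Dense kernels.
    import numpy as np
    import torch
    import tensorflow as tf

    rng = np.random.default_rng(0)
    x = rng.standard_normal((2, 8)).astype(np.float32)   # batch of 2, in_dim=8 (illustrative)

    lin = torch.nn.Linear(8, 4)                           # PyTorch weight shape: (out=4, in=8)
    w = lin.weight.detach().numpy()
    b = lin.bias.detach().numpy()

    dense = tf.keras.layers.Dense(4)
    dense.build((None, 8))                                # Keras kernel shape: (in=8, out=4)
    dense.set_weights([w.T, b])

    y_pt = lin(torch.from_numpy(x)).detach().numpy()
    y_tf = dense(tf.constant(x)).numpy()
    print(np.max(np.abs(y_pt - y_tf)))                    # ~1e-7, identical up to float32 noise

The same rule covers colbert_linear: a head weight W of shape (O, H) becomes a kernel of shape (H, O), with the bias copied as-is.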
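
Position-id note. _pt_style_l0 and manual_l0_from_pt both rebuild position ids from the attention mask: positions count only real tokens, start at padding_idx + 1, and padded slots fall back to padding_idx. A small numpy illustration with a made-up mask (this mirrors the HF XLM-RoBERTa position-id computation whenever the attention mask and the non-padding mask coincide):

    # Sketch only: the cumsum-based position-id rule assumed by the L0 checks.
    import numpy as np

    attention_mask = np.array([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]], dtype=np.int32)
    padding_idx = 1

    position_ids = np.cumsum(attention_mask, axis=1) * attention_mask + padding_idx
    print(position_ids)
    # [[2 3 4 1 1]
    #  [2 3 4 5 6]]

Padded slots therefore index the padding row of position_embeddings, so they contribute the same (padding) embedding the PyTorch model uses before LayerNorm.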
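
LayerNorm note. The L0 sanity check uses a hand-written LayerNorm: mean and population variance over the hidden axis, with eps added inside the square root. A short check on random data that this formulation agrees with torch.nn.functional.layer_norm, which is what the original checkpoint was trained against; shapes here are illustrative:

    # Sketch only: manual LayerNorm (as in _pt_style_l0) vs. torch layer_norm.
    import numpy as np
    import torch

    rng = np.random.default_rng(0)
    x = rng.standard_normal((2, 5, 16)).astype(np.float32)
    gamma = rng.standard_normal(16).astype(np.float32)
    beta = rng.standard_normal(16).astype(np.float32)
    eps = 1e-5

    mean = x.mean(axis=-1, keepdims=True)
    var = ((x - mean) ** 2).mean(axis=-1, keepdims=True)   # population variance
    manual = (x - mean) / np.sqrt(var + eps) * gamma + beta

    ref = torch.nn.functional.layer_norm(
        torch.from_numpy(x), (16,), torch.from_numpy(gamma), torch.from_numpy(beta), eps
    ).numpy()
    print(np.max(np.abs(manual - ref)))                    # ~1e-6 or smaller

Keras LayerNormalization(epsilon=1e-5) evaluates the same expression, so copying gamma/beta and keeping epsilon at 1e-5 should match the PyTorch side up to float32 precision.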