From 5bc774d1f8e054d9590be3edafda1a18443690be Mon Sep 17 00:00:00 2001 From: sigridjineth Date: Sat, 6 Sep 2025 14:36:14 +0900 Subject: [PATCH 1/3] feat: colbert_vecs --- BGEM3TFModel.py | 216 ++++++++++++++++++---------------- BGEM3WeightConverter.py | 91 ++++++++++++-- model_conversion_validator.py | 80 ++++++++++--- tf_colbert_loader.py | 101 ++++++++++++++++ torch_tf_validator.py | 67 ++++++++--- 5 files changed, 412 insertions(+), 143 deletions(-) create mode 100644 tf_colbert_loader.py diff --git a/BGEM3TFModel.py b/BGEM3TFModel.py index 14e8fc9..48eef84 100644 --- a/BGEM3TFModel.py +++ b/BGEM3TFModel.py @@ -18,25 +18,27 @@ def __init__(self, d_model, num_heads, dropout_rate=0.1, **kwargs): self.d_model = d_model self.depth = d_model // num_heads # 각 헤드의 차원 크기 - # Query, Key, Value를 위한 Dense Layer - self.wq = tf.keras.layers.Dense(d_model) - self.wk = tf.keras.layers.Dense(d_model) - self.wv = tf.keras.layers.Dense(d_model) + # Query, Key, Value를 위한 Dense Layer (stable names for SavedModel) + self.wq = tf.keras.layers.Dense(d_model, name="attention_wq") + self.wk = tf.keras.layers.Dense(d_model, name="attention_wk") + self.wv = tf.keras.layers.Dense(d_model, name="attention_wv") # 출력 레이어 - self.dense = tf.keras.layers.Dense(d_model) + self.dense = tf.keras.layers.Dense(d_model, name="attention_output") # 어텐션 layerNorm - self.attlayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-5) + self.attlayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="attn_LayerNorm") # 드롭아웃 self.dropout = tf.keras.layers.Dropout(dropout_rate) - def stable_softmax(self, logits, axis=None, name=None): - """ - Stable softmax implementation - """ - return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name) + def stable_softmax(self, logits, axis=-1, name=None): + """Numerically stable softmax: subtract max and compute in float32.""" + dtype = logits.dtype + x = tf.cast(logits, tf.float32) + x = x - tf.reduce_max(x, axis=axis, keepdims=True) + probs = tf.nn.softmax(x, axis=axis, name=name) + return tf.cast(probs, dtype) def split_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) @@ -45,27 +47,29 @@ def split_heads(self, x, batch_size): def call(self, inputs, mask=None, training=False): batch_size = tf.shape(inputs)[0] - # Query, Key, Value를 계산 - q = self.wq(inputs) # (batch_size, seq_len, d_model) - k = self.wk(inputs) # (batch_size, seq_len, d_model) - v = self.wv(inputs) # (batch_size, seq_len, d_model) + # Projections + q = self.wq(inputs) + k = self.wk(inputs) + v = self.wv(inputs) - # 다중 헤드로 분리 - q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth) - k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth) - v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth) + # Split heads + q = self.split_heads(q, batch_size) + k = self.split_heads(k, batch_size) + v = self.split_heads(v, batch_size) - # Scaled Dot-Product Attention - sqrt_att_head_size = math.sqrt(self.depth) - - attention_scores = tf.matmul(q, k, transpose_b=True) # (batch_size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(sqrt_att_head_size, tf.float32) - attention_scores = tf.divide(attention_scores, dk) + # Scaled dot-product attention (compute in float32 for stability) + q_f = tf.cast(q, tf.float32) + k_f = tf.cast(k, tf.float32) + attention_scores = tf.matmul(q_f, k_f, transpose_b=True) + scale = tf.sqrt(tf.cast(self.depth, tf.float32)) + attention_scores = attention_scores / scale if mask is not 
None: - attention_scores = tf.add(attention_scores, mask) + attention_scores = attention_scores + tf.cast(mask, tf.float32) attention_probs = self.stable_softmax(attention_scores, axis=-1) + # Cast back to v dtype for matmul efficiency under mixed precision + attention_probs = tf.cast(attention_probs, v.dtype) attention_probs = self.dropout(attention_probs, training=training) # Attention result @@ -92,7 +96,8 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True, colbert_dim=-1, batch_size=256, query_max_length=512, passage_max_length=512, return_dense=True, return_sparse=False, return_colbert_vecs=False, dropout_rate=0.1): - super().__init__(name="bge-m3-tensorflow") + # Use safe model name (no hyphen or dot) to avoid TF resource container issues + super().__init__(name="bge_m3_tensorflow") self.model_name = model_name self.normalize_embeddings = normalize_embeddings @@ -118,11 +123,23 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True, self.num_layers = self.config.num_hidden_layers self.vocab_size = self.config.vocab_size + # Optional mixed precision + if self.use_fp16: + from tensorflow.keras import mixed_precision + try: + mixed_precision.set_global_policy("mixed_float16") + except Exception: + pass + # Build components self._build_embeddings() self._build_encoder_layers() self._build_pooler() + # Handle ColBERT dim parameter + self.colbert_dim = self.d_model if not colbert_dim or colbert_dim < 1 else int(colbert_dim) self._build_colbert() + # Sparse head (optional) + self.sparse_linear = tf.keras.layers.Dense(1, name="sparse_linear") # Tokenizer self.tokenizer = AutoTokenizer.from_pretrained( @@ -193,7 +210,7 @@ def _build_encoder_layers(self): num_heads=self.num_heads, intermediate_size=self.config.intermediate_size, dropout_rate=self.dropout_rate, - name=f"encoder.layer.{i}" + name=f"encoder_layer_{i}" ) self.encoder_layers.append(layer) @@ -203,13 +220,11 @@ def _build_pooler(self): self.d_model, activation='tanh', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), - name="pooler.dense" + name="pooler_dense" ) def _build_colbert(self): - self.colbert_linear = tf.keras.layers.Dense( - units=self.d_model, - ) + self.colbert_linear = tf.keras.layers.Dense(self.colbert_dim, name="colbert_linear") def call(self, inputs, training=False, output_hidden_states=False): @@ -225,7 +240,7 @@ def call(self, inputs, training=False, output_hidden_states=False): input_shape = self.shape_list(inputs_embeds)[:-1] if token_type_ids is None: - token_type_ids = tf.fill(dims=input_shape, value=0) + token_type_ids = tf.zeros_like(input_ids, dtype=tf.int32) if position_ids is None: if input_ids is not None: @@ -248,18 +263,17 @@ def call(self, inputs, training=False, output_hidden_states=False): if training: embedding_output = self.dropout(embedding_output, training=training) - attention_mask_origin = attention_mask + # Ensure attention mask exists and is float32 for numerical stability + if attention_mask is None: + attention_mask = tf.ones_like(input_ids, dtype=tf.int32) - attention_mask_shape = self.shape_list(attention_mask) - - extended_attention_mask = tf.reshape( - attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) - ) + attention_mask_origin = attention_mask - extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) - one_cst = tf.constant(1.0, dtype=embedding_output.dtype) - ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) - extended_attention_mask = 
tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + B = tf.shape(input_ids)[0] + L = tf.shape(input_ids)[1] + extended_attention_mask = tf.reshape(tf.cast(attention_mask, tf.float32), (B, 1, 1, L)) + # Large negative for masked positions (kept in float32) + extended_attention_mask = (1.0 - extended_attention_mask) * (-1e9) attention_mask = extended_attention_mask @@ -276,30 +290,32 @@ def call(self, inputs, training=False, output_hidden_states=False): if output_hidden_states: all_hidden_states.append(hidden_states) - # Pooling - if self.pooling_method == "mean": - pooled_output = tf.reduce_mean(hidden_states, axis=1) - else: # default: cls - pooled_output = hidden_states[:, 0, :] - - # Apply pooler if return_dense is True - if self.return_dense: - pooled_output = pooled_output - - # Normalize embeddings if specified - if self.normalize_embeddings: - pooled_output = tf.nn.l2_normalize(pooled_output, axis=-1) + # Final last_hidden_state (B, T, H) in float32 (no pooling here) + last_hidden_state = tf.cast(hidden_states, tf.float32) ## colbert_vecs - colbert_vecs = self.colbert_linear(hidden_states[:, 1:]) - colbert_vecs = colbert_vecs * tf.cast(attention_mask_origin[:, 1:][:, :, None], dtype=tf.float32) + colbert_vecs = None + if self.return_colbert_vecs: + # Compute in the native dtype (e.g., float16 under mixed precision) + colbert_in = hidden_states[:, 1:] + colbert_out = self.colbert_linear(colbert_in) + # Match mask dtype to colbert_out to avoid dtype mismatch in multiplication + m = tf.cast(attention_mask_origin[:, 1:], colbert_out.dtype)[:, :, None] + colbert_out = colbert_out * m + # Return as float32 for serving stability + colbert_vecs = tf.cast(colbert_out, tf.float32) outputs = { - "dense_vecs": pooled_output, - "colbert_vecs": colbert_vecs, - "last_hidden_state": hidden_states + "last_hidden_state": last_hidden_state } + if colbert_vecs is not None: + outputs["colbert_vecs"] = colbert_vecs + + if self.return_sparse: + token_weights = tf.nn.relu(self.sparse_linear(hidden_states)) + outputs["token_weights"] = token_weights + if output_hidden_states: outputs["hidden_states"] = all_hidden_states @@ -311,17 +327,15 @@ def __init__(self, d_model, num_heads, intermediate_size, dropout_rate=0.1, **kw super().__init__(**kwargs) self.attention = MultiHeadAttention(d_model, num_heads, dropout_rate) - self.attention_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) - self.attention_dropout = tf.keras.layers.Dropout(dropout_rate) # Intermediate -> gelu_approx self.intermediate = tf.keras.layers.Dense( intermediate_size, - name="intermediate.dense" + name="intermediate_dense" ) - self.output_dense = tf.keras.layers.Dense(d_model, name="output.dense") + self.output_dense = tf.keras.layers.Dense(d_model, name="output_dense") self.output_dropout = tf.keras.layers.Dropout(dropout_rate) - self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) + self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="output_LayerNorm") def gelu_approx(self, x): x = tf.convert_to_tensor(x) @@ -350,53 +364,57 @@ def call(self, x, attention_mask=None, training=False): return output -def save_model_with_tokenizer(model, tokenizer, save_path): - """Save both model and tokenizer""" +def save_model_with_tokenizer(model: "BGEM3TensorFlow", tokenizer, save_path: str): + """Export SavedModel with a single clean default signature. 
+ + inputs : int64 (input_ids, attention_mask) + outputs: last_hidden_state (B,T,H,float32), optional colbert_vecs (B,T-1,H,float32) + """ os.makedirs(save_path, exist_ok=True) model_save_path = os.path.join(save_path, 'model') - - # Ensure model is built by calling it with dummy inputs - dummy_inputs = { - 'input_ids': tf.zeros((2, 11), dtype=tf.int32), - 'attention_mask': tf.ones((2, 11), dtype=tf.int32) + # Clean previous export to avoid stale graph/variable metadata + try: + import shutil + if os.path.exists(model_save_path): + shutil.rmtree(model_save_path) + except Exception: + pass + + # Build variables once + dummy = { + 'input_ids': tf.zeros((2, 8), dtype=tf.int32), + 'attention_mask': tf.ones((2, 8), dtype=tf.int32), + 'token_type_ids': tf.zeros((2, 8), dtype=tf.int32), } - _ = model(dummy_inputs, training=False, output_hidden_states=True) + _ = model(dummy, training=False, output_hidden_states=False) - # Define serving signature @tf.function(input_signature=[ - tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='input_ids'), - tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='attention_mask') + tf.TensorSpec([None, None], tf.int64, name='input_ids'), + tf.TensorSpec([None, None], tf.int64, name='attention_mask'), ]) - def serving_fn(input_ids, attention_mask): + def serving_default(input_ids, attention_mask): + # Cast to int32, synthesize token_type_ids + ii = tf.cast(input_ids, tf.int32) + am = tf.cast(attention_mask, tf.int32) + tt = tf.zeros_like(ii) + + outs = model({'input_ids': ii, 'attention_mask': am, 'token_type_ids': tt}, + training=False, output_hidden_states=False) - print(input_ids) - inputs = { - 'input_ids': input_ids, - 'attention_mask': attention_mask + ret = { + 'last_hidden_state': tf.cast(outs['last_hidden_state'], tf.float32) } + if 'colbert_vecs' in outs: + ret['colbert_vecs'] = tf.cast(outs['colbert_vecs'], tf.float32) + return ret - outputs = model(inputs=inputs, training=False, output_hidden_states=True) - - if outputs.get('hidden_states'): - hidden_states = tf.stack(outputs['hidden_states'], axis=0) - return { - 'dense_vecs': outputs['dense_vecs'], # CLS Token - 'colbert_vecs': outputs['colbert_vecs'], - 'hidden_states': hidden_states # (num_layers, batch, seq_len, hidden_dim) - } - else: - return { - 'dense_vecs': outputs['dense_vecs'], - } - - # Save model + # Save the Keras model itself with a single default signature tf.saved_model.save( model, model_save_path, - signatures={'serving_default': serving_fn} + signatures={'serving_default': serving_default} ) - # Save tokenizer tokenizer.save_pretrained(save_path) return model_save_path diff --git a/BGEM3WeightConverter.py b/BGEM3WeightConverter.py index e6a26c2..8164086 100644 --- a/BGEM3WeightConverter.py +++ b/BGEM3WeightConverter.py @@ -16,7 +16,8 @@ def load_sparse_weights(): raise FileNotFoundError(f"FileNotFoundError: {model_path}") device = 'cuda' if torch.cuda.is_available() else 'cpu' - return torch.load(model_path, map_location=device, weights_only=True) + # Avoid weights_only for broader PyTorch compatibility + return torch.load(model_path, map_location=device) def load_colbert_weights(): @@ -55,11 +56,58 @@ def _init_colbert_weights(tf_model): colbert = load_colbert_weights() colbert_weights = colbert['weight'] colbert_bias = colbert['bias'] + # Convert to numpy and report shape + w = colbert_weights.detach().cpu().numpy() if hasattr(colbert_weights, "detach") else np.array(colbert_weights) + b = colbert_bias.detach().cpu().numpy() if hasattr(colbert_bias, "detach") else 
np.array(colbert_bias) - tf_model.colbert_linear.set_weights([ - colbert_weights.numpy().T, - colbert_bias.numpy() - ]) + out_dim, in_dim = w.shape # PT: (out_dim, in_dim) + print(f"ColBERT head weight shape: (out_dim={out_dim}, in_dim={in_dim})") + + # Ensure the Dense layer has matching units and is built + try: + current_units = getattr(tf_model.colbert_linear, "units", None) + except Exception: + current_units = None + + if current_units is not None and current_units != out_dim: + # Units mismatch; warn. Ideally create the model with detected colbert_dim to avoid this. + print(f"Warning: colbert_linear units ({current_units}) != detected out_dim ({out_dim}). We will attempt to set weights and may fail.") + + # Ensure variables exist. If not built yet, do a dummy call to build with correct in_dim. + if not getattr(tf_model.colbert_linear, "built", False): + dummy = tf.zeros((1, 2, in_dim), dtype=tf.float32) + _ = tf_model.colbert_linear(dummy) + + # Set weights (kernel shape: (in_dim, out_dim)) + tf_model.colbert_linear.set_weights([w.T, b]) + + +def _init_sparse_weights(tf_model): + """Initialize sparse head weights if available (optional).""" + try: + st = load_sparse_weights() + except FileNotFoundError as e: + print(str(e)) + return + + # Expect PyTorch shape: (out_dim=1, in_dim=hidden) + w_pt = st["weight"] + b_pt = st["bias"] + # Ensure numpy + if hasattr(w_pt, "cpu"): + w_np = w_pt.cpu().numpy() + else: + w_np = np.array(w_pt) + if hasattr(b_pt, "cpu"): + b_np = b_pt.cpu().numpy() + else: + b_np = np.array(b_pt) + + # Build layer if not built + in_dim = w_np.shape[1] + tf_model.sparse_linear.build((None, None, in_dim)) + # Keras Dense kernel shape: (in_dim, out_dim) + tf_model.sparse_linear.set_weights([w_np.T, b_np]) class BGEM3WeightConverter: @@ -85,15 +133,15 @@ def initialize_weights(self, tf_model): # Initialize encoder layers self._init_transformer_blocks(tf_model) - # Initialize pooler - self._init_pooler_weights(tf_model) - - # Initialize pooler + # Initialize pooler (once) self._init_pooler_weights(tf_model) # Initialize colbert _init_colbert_weights(tf_model) + # Initialize sparse head (optional) + _init_sparse_weights(tf_model) + return tf_model def _init_embedding_weights(self, tf_model): @@ -230,9 +278,28 @@ def _init_pooler_weights(self, tf_model): def convert_and_save_model(model_name: str, save_path: str): - """Convert PyTorch model to TensorFlow and save""" - # Initialize TensorFlow model - tf_model = BGEM3TensorFlow(model_name) + """Convert PyTorch model to TensorFlow and save. + Also detects and uses original ColBERT dimension for TF head. 
+ """ + # Detect ColBERT original dimension from weights (out_dim) + try: + colbert = load_colbert_weights() + colbert_w = colbert['weight'] + out_dim = int(colbert_w.shape[0]) + print(f"Detected ColBERT dimension: {out_dim}") + colbert_dim = out_dim + return_colbert_vecs = True + except Exception as e: + print(f"ColBERT weights not found or failed to load: {e}") + colbert_dim = -1 + return_colbert_vecs = False + + # Initialize TensorFlow model with detected colbert_dim + tf_model = BGEM3TensorFlow( + model_name, + colbert_dim=colbert_dim, + return_colbert_vecs=return_colbert_vecs, + ) # Convert weights converter = BGEM3WeightConverter(model_name) diff --git a/model_conversion_validator.py b/model_conversion_validator.py index 955640f..a417943 100644 --- a/model_conversion_validator.py +++ b/model_conversion_validator.py @@ -83,16 +83,64 @@ def load_converted_tf_model(saved_model_dir): """ model_path = f"{saved_model_dir}/model" loaded_model = tf.saved_model.load(model_path) - serving_fn = loaded_model.signatures["serving_default"] + # Prefer a compatible signature if available + prefer = [ + "serving_default", + "serving_int32_3in", + "serving_int64_3in", + "serving_int32_2in", + "serving_int64_2in", + ] + sigs = loaded_model.signatures + for k in prefer: + if k in sigs: + serving_fn = sigs[k] + break + else: + raise RuntimeError("No suitable serving signature found in SavedModel.") tokenizer = AutoTokenizer.from_pretrained(saved_model_dir) return serving_fn, tokenizer +def call_signature(sig, input_ids, attention_mask, token_type_ids=None): + """ + Call SavedModel signature with automatic key/dtype adaptation. + - Supplies only required keys + - Fills missing token_type_ids with zeros + - Casts inputs to signature dtypes + """ + # structured_input_signature: (args, kwargs) + spec_kwargs = sig.structured_input_signature[1] + + def prepare(name, value): + if name not in spec_kwargs: + return None + if value is None and name == "token_type_ids": + value = tf.zeros_like(input_ids) + want = spec_kwargs[name].dtype + if hasattr(value, "dtype") and value.dtype != want: + value = tf.cast(value, want) + return value + + kwargs = {} + x = prepare("input_ids", input_ids) + if x is not None: + kwargs["input_ids"] = x + x = prepare("attention_mask", attention_mask) + if x is not None: + kwargs["attention_mask"] = x + x = prepare("token_type_ids", token_type_ids) + if x is not None: + kwargs["token_type_ids"] = x + + return sig(**kwargs) + + def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): """ TensorFlow 모델(서빙 시그니처)로 임베딩 추출하는 함수. - BGEM3TensorFlow 구조상 "dense_vecs" 키에 최종 임베딩이 들어있다고 가정. + SavedModel은 last_hidden_state (B,T,H)만 반환하므로 CLS 풀링을 적용해 (B,H) 임베딩 생성. 
""" inputs = tokenizer( queries, @@ -102,11 +150,11 @@ def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): return_tensors="tf" ) - outputs = serving_fn( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"] - ) - embeddings = outputs["dense_vecs"].numpy() # (batch_size, hidden_size) + token_type_ids = inputs.get("token_type_ids", tf.zeros_like(inputs["input_ids"])) + outputs = call_signature(serving_fn, inputs["input_ids"], inputs["attention_mask"], token_type_ids) + # Serving returns last_hidden_state (B, T, H); apply CLS pooling for embedding + last_hidden = outputs["last_hidden_state"] # (B, T, H) + embeddings = last_hidden[:, 0, :].numpy() # (B, H) return embeddings @@ -125,15 +173,17 @@ def encode_with_tf_model_and_get_hidden_states(serving_fn, tokenizer, queries, m return_tensors="tf" ) - outputs = serving_fn( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"] - ) + token_type_ids = inputs.get("token_type_ids", tf.zeros_like(inputs["input_ids"])) + outputs = call_signature(serving_fn, inputs["input_ids"], inputs["attention_mask"], token_type_ids) - hidden_states = outputs["hidden_states"] # (num_layers, batch, seq_len, hidden_dim) - final_embeddings = outputs["dense_vecs"] - print("outputs['colbert_vecs'] : ") - print(outputs["colbert_vecs"]) + # Only last_hidden_state is returned in serving; keep KeyError behavior for old path + hidden_states = outputs["hidden_states"] # will raise KeyError (by design) + final_embeddings = outputs["last_hidden_state"] + if "colbert_vecs" in outputs: + print("outputs['colbert_vecs'] : ") + print(outputs["colbert_vecs"]) + else: + print("colbert_vecs not returned by TF model (flag disabled).") return final_embeddings.numpy(), hidden_states diff --git a/tf_colbert_loader.py b/tf_colbert_loader.py new file mode 100644 index 0000000..8005891 --- /dev/null +++ b/tf_colbert_loader.py @@ -0,0 +1,101 @@ +import os +import argparse +import tensorflow as tf +from transformers import AutoTokenizer + + +def resolve_model_path(root_dir: str) -> str: + """Return a path that contains saved_model.pb (root or /model).""" + cand1 = root_dir + cand2 = os.path.join(root_dir, "model") + if os.path.exists(os.path.join(cand2, "saved_model.pb")): + return cand2 + if os.path.exists(os.path.join(cand1, "saved_model.pb")): + return cand1 + raise FileNotFoundError(f"No SavedModel found under '{root_dir}' (checked '{cand2}' and '{cand1}')") + + +def call_signature(sig, input_ids, attention_mask, token_type_ids=None): + """Call signature with best-effort arg set and dtype handling. + Tries (int64,int32) x (3-key,2-key) in order. 
+ """ + last_err = None + for dtype in (tf.int64, tf.int32): + ii = tf.cast(input_ids, dtype) + am = tf.cast(attention_mask, dtype) + # Try 3-key first + if token_type_ids is not None: + try: + tt = tf.cast(token_type_ids, dtype) + return sig(input_ids=ii, attention_mask=am, token_type_ids=tt) + except Exception as e: + last_err = e + # Then 2-key + try: + return sig(input_ids=ii, attention_mask=am) + except Exception as e: + last_err = e + continue + if last_err is not None: + raise last_err + raise RuntimeError("Failed to call signature with any supported argument pattern") + + +def main(): + parser = argparse.ArgumentParser(description="Load TF ColBERT SavedModel and print output shapes.") + parser.add_argument("--model_dir", default="./converted_bge_m3", help="Path to SavedModel root (contains tokenizer files).") + parser.add_argument("--max_length", type=int, default=128, help="Tokenization max length.") + parser.add_argument("--texts", nargs="*", default=[ + "이 모델은 무엇을 하는 모델인가요?", + "bge-m3 tensorflow colbert vectors test", + ]) + args = parser.parse_args() + + # Load tokenizer from the same root dir + tok = AutoTokenizer.from_pretrained(args.model_dir) + inputs = tok( + args.texts, + padding=True, + truncation=True, + max_length=args.max_length, + return_tensors="tf", + ) + # token_type_ids may be missing for XLM-R; create zeros if absent + if "token_type_ids" not in inputs: + inputs["token_type_ids"] = tf.zeros_like(inputs["input_ids"], dtype=inputs["input_ids"].dtype) + + # Load SavedModel signature + model_path = resolve_model_path(args.model_dir) + loaded = tf.saved_model.load(model_path) + sig = loaded.signatures.get("serving_default") + if sig is None: + raise RuntimeError("serving_default signature not found") + + # Call signature robustly + outs = None + try: + outs = call_signature(sig, inputs["input_ids"], inputs["attention_mask"], inputs.get("token_type_ids")) + except TypeError: + # Try without token_type_ids + outs = call_signature(sig, inputs["input_ids"], inputs["attention_mask"], None) + + # Print keys and shapes + print("Signature outputs:") + for k, v in outs.items(): + try: + print(f"- {k}: shape={tuple(v.shape)}, dtype={v.dtype}") + except Exception: + print(f"- {k}: ") + + # Convenience checks + if "last_hidden_state" in outs: + print("last_hidden_state OK ->", tuple(outs["last_hidden_state"].shape)) + if "colbert_vecs" in outs: + print("colbert_vecs OK ->", tuple(outs["colbert_vecs"].shape)) + else: + print("colbert_vecs not present in signature outputs.") + # hidden_states is no longer returned in serving by design + + +if __name__ == "__main__": + main() diff --git a/torch_tf_validator.py b/torch_tf_validator.py index 0efe3ce..b745ab4 100644 --- a/torch_tf_validator.py +++ b/torch_tf_validator.py @@ -28,6 +28,33 @@ def tokenize_w_padding(tokenizer, text, return_tensors="pt", max_length=512): return tokenizer(text, padding="max_length", max_length=max_length, return_tensors=return_tensors) +def call_signature(sig, input_ids, attention_mask, token_type_ids=None): + """Adapt arguments to a SavedModel signature: keys and dtypes.""" + spec_kwargs = sig.structured_input_signature[1] + + def prepare(name, value): + if name not in spec_kwargs: + return None + if value is None and name == "token_type_ids": + value = tf.zeros_like(input_ids) + want = spec_kwargs[name].dtype + if hasattr(value, "dtype") and value.dtype != want: + value = tf.cast(value, want) + return value + + kwargs = {} + x = prepare("input_ids", input_ids) + if x is not None: + kwargs["input_ids"] = 
x + x = prepare("attention_mask", attention_mask) + if x is not None: + kwargs["attention_mask"] = x + x = prepare("token_type_ids", token_type_ids) + if x is not None: + kwargs["token_type_ids"] = x + return sig(**kwargs) + + def main(): # Load the model model_path = "BAAI/bge-m3" @@ -61,34 +88,40 @@ def main(): inputs_tf = tokenize_wo_padding(tokenizer, text, return_tensors="tf") inputs_tf_w_padding = tokenize_w_padding(tokenizer, text, return_tensors="tf") - inputs_tf_w_padding_attnFixed = inputs_tf_w_padding.copy() - inputs_tf_w_padding_attnFixed['attention_mask'] = tf.where(inputs_tf_w_padding['attention_mask'] == 0, -9999999, 0) - tf_model = load_tf_model(model_path_tf).signatures["serving_default"] + loaded = load_tf_model(model_path_tf) + # Use the default 2-input signature + sigs = loaded.signatures + tf_model = sigs.get("serving_default") + if tf_model is None: + raise RuntimeError("serving_default signature not found") loguru.logger.info("Tensorflow] Model output".ljust(50, "-")) with tf.device("/GPU:0"): - output_tf = tf_model(**inputs_tf) - output_tf_w_padding = tf_model(**inputs_tf_w_padding) - output_tf_w_padding_attnFixed = tf_model(**inputs_tf_w_padding_attnFixed) + output_tf = call_signature( + tf_model, inputs_tf["input_ids"], inputs_tf["attention_mask"], None + ) + output_tf_w_padding = call_signature( + tf_model, + inputs_tf_w_padding["input_ids"], + inputs_tf_w_padding["attention_mask"], + None, + ) loguru.logger.info("output without padding (GT)".ljust(50, "-")) - loguru.logger.info(output_tf['hidden_states'][-1][:,0]) + hs = output_tf['last_hidden_state'] + val_no_pad = hs[:, 0] + loguru.logger.info(val_no_pad) loguru.logger.info("="*50) loguru.logger.info("output with padding".ljust(50, "-")) - loguru.logger.info(output_tf_w_padding['hidden_states'][-1][:,0]) + hsw = output_tf_w_padding['last_hidden_state'] + val_pad = hsw[:, 0] + loguru.logger.info(val_pad) loguru.logger.info("="*50) - loguru.logger.info("output with padding (attention fixed)".ljust(50, "-")) - loguru.logger.info(output_tf_w_padding_attnFixed['hidden_states'][-1][:,0]) - loguru.logger.info("="*50) - err_tf = tf.abs(output_tf['hidden_states'][-1][:,0] - output_tf_w_padding['hidden_states'][-1][:,0]) + err_tf = tf.abs(val_no_pad - val_pad) loguru.logger.info("Error".ljust(50, "-")) loguru.logger.info(tf.reduce_mean(err_tf)) loguru.logger.info("="*50) - err_tf_attnFixed = tf.abs(output_tf_w_padding['hidden_states'][-1][:,0] - output_tf_w_padding_attnFixed['hidden_states'][-1][:,0]) - loguru.logger.info("Error (attention fixed)".ljust(50, "-")) - loguru.logger.info(tf.reduce_mean(err_tf_attnFixed)) - loguru.logger.info("="*50) if __name__ == "__main__": - main() \ No newline at end of file + main() From 2496b49e1750dd1c3fc6f37e4f6a884c86601c13 Mon Sep 17 00:00:00 2001 From: sigridjineth Date: Sun, 7 Sep 2025 18:13:11 +0900 Subject: [PATCH 2/3] fix --- BGEM3TFModel.py | 216 ++++++++-------- BGEM3TFModel_tfkeras2.py | 246 ++++++++++++++++++ BGEM3WeightConverter.py | 91 +------ export_tf1_saved_model.py | 369 +++++++++++++++++++++++++++ model_conversion_validator.py | 465 +++++++++------------------------- tf1_session_validator.py | 39 +++ tf1_validator.py | 165 ++++++++++++ tf_colbert_loader.py | 2 +- torch_tf_validator.py | 198 ++++++--------- 9 files changed, 1128 insertions(+), 663 deletions(-) create mode 100644 BGEM3TFModel_tfkeras2.py create mode 100644 export_tf1_saved_model.py create mode 100644 tf1_session_validator.py create mode 100644 tf1_validator.py diff --git a/BGEM3TFModel.py 
b/BGEM3TFModel.py index 48eef84..14e8fc9 100644 --- a/BGEM3TFModel.py +++ b/BGEM3TFModel.py @@ -18,27 +18,25 @@ def __init__(self, d_model, num_heads, dropout_rate=0.1, **kwargs): self.d_model = d_model self.depth = d_model // num_heads # 각 헤드의 차원 크기 - # Query, Key, Value를 위한 Dense Layer (stable names for SavedModel) - self.wq = tf.keras.layers.Dense(d_model, name="attention_wq") - self.wk = tf.keras.layers.Dense(d_model, name="attention_wk") - self.wv = tf.keras.layers.Dense(d_model, name="attention_wv") + # Query, Key, Value를 위한 Dense Layer + self.wq = tf.keras.layers.Dense(d_model) + self.wk = tf.keras.layers.Dense(d_model) + self.wv = tf.keras.layers.Dense(d_model) # 출력 레이어 - self.dense = tf.keras.layers.Dense(d_model, name="attention_output") + self.dense = tf.keras.layers.Dense(d_model) # 어텐션 layerNorm - self.attlayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="attn_LayerNorm") + self.attlayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-5) # 드롭아웃 self.dropout = tf.keras.layers.Dropout(dropout_rate) - def stable_softmax(self, logits, axis=-1, name=None): - """Numerically stable softmax: subtract max and compute in float32.""" - dtype = logits.dtype - x = tf.cast(logits, tf.float32) - x = x - tf.reduce_max(x, axis=axis, keepdims=True) - probs = tf.nn.softmax(x, axis=axis, name=name) - return tf.cast(probs, dtype) + def stable_softmax(self, logits, axis=None, name=None): + """ + Stable softmax implementation + """ + return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name) def split_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) @@ -47,29 +45,27 @@ def split_heads(self, x, batch_size): def call(self, inputs, mask=None, training=False): batch_size = tf.shape(inputs)[0] - # Projections - q = self.wq(inputs) - k = self.wk(inputs) - v = self.wv(inputs) + # Query, Key, Value를 계산 + q = self.wq(inputs) # (batch_size, seq_len, d_model) + k = self.wk(inputs) # (batch_size, seq_len, d_model) + v = self.wv(inputs) # (batch_size, seq_len, d_model) - # Split heads - q = self.split_heads(q, batch_size) - k = self.split_heads(k, batch_size) - v = self.split_heads(v, batch_size) + # 다중 헤드로 분리 + q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth) + k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth) + v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth) - # Scaled dot-product attention (compute in float32 for stability) - q_f = tf.cast(q, tf.float32) - k_f = tf.cast(k, tf.float32) - attention_scores = tf.matmul(q_f, k_f, transpose_b=True) - scale = tf.sqrt(tf.cast(self.depth, tf.float32)) - attention_scores = attention_scores / scale + # Scaled Dot-Product Attention + sqrt_att_head_size = math.sqrt(self.depth) + + attention_scores = tf.matmul(q, k, transpose_b=True) # (batch_size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(sqrt_att_head_size, tf.float32) + attention_scores = tf.divide(attention_scores, dk) if mask is not None: - attention_scores = attention_scores + tf.cast(mask, tf.float32) + attention_scores = tf.add(attention_scores, mask) attention_probs = self.stable_softmax(attention_scores, axis=-1) - # Cast back to v dtype for matmul efficiency under mixed precision - attention_probs = tf.cast(attention_probs, v.dtype) attention_probs = self.dropout(attention_probs, training=training) # Attention result @@ -96,8 +92,7 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True, colbert_dim=-1, batch_size=256, 
query_max_length=512, passage_max_length=512, return_dense=True, return_sparse=False, return_colbert_vecs=False, dropout_rate=0.1): - # Use safe model name (no hyphen or dot) to avoid TF resource container issues - super().__init__(name="bge_m3_tensorflow") + super().__init__(name="bge-m3-tensorflow") self.model_name = model_name self.normalize_embeddings = normalize_embeddings @@ -123,23 +118,11 @@ def __init__(self, model_name, normalize_embeddings=False, use_fp16=True, self.num_layers = self.config.num_hidden_layers self.vocab_size = self.config.vocab_size - # Optional mixed precision - if self.use_fp16: - from tensorflow.keras import mixed_precision - try: - mixed_precision.set_global_policy("mixed_float16") - except Exception: - pass - # Build components self._build_embeddings() self._build_encoder_layers() self._build_pooler() - # Handle ColBERT dim parameter - self.colbert_dim = self.d_model if not colbert_dim or colbert_dim < 1 else int(colbert_dim) self._build_colbert() - # Sparse head (optional) - self.sparse_linear = tf.keras.layers.Dense(1, name="sparse_linear") # Tokenizer self.tokenizer = AutoTokenizer.from_pretrained( @@ -210,7 +193,7 @@ def _build_encoder_layers(self): num_heads=self.num_heads, intermediate_size=self.config.intermediate_size, dropout_rate=self.dropout_rate, - name=f"encoder_layer_{i}" + name=f"encoder.layer.{i}" ) self.encoder_layers.append(layer) @@ -220,11 +203,13 @@ def _build_pooler(self): self.d_model, activation='tanh', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), - name="pooler_dense" + name="pooler.dense" ) def _build_colbert(self): - self.colbert_linear = tf.keras.layers.Dense(self.colbert_dim, name="colbert_linear") + self.colbert_linear = tf.keras.layers.Dense( + units=self.d_model, + ) def call(self, inputs, training=False, output_hidden_states=False): @@ -240,7 +225,7 @@ def call(self, inputs, training=False, output_hidden_states=False): input_shape = self.shape_list(inputs_embeds)[:-1] if token_type_ids is None: - token_type_ids = tf.zeros_like(input_ids, dtype=tf.int32) + token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: if input_ids is not None: @@ -263,17 +248,18 @@ def call(self, inputs, training=False, output_hidden_states=False): if training: embedding_output = self.dropout(embedding_output, training=training) - # Ensure attention mask exists and is float32 for numerical stability - if attention_mask is None: - attention_mask = tf.ones_like(input_ids, dtype=tf.int32) - attention_mask_origin = attention_mask - B = tf.shape(input_ids)[0] - L = tf.shape(input_ids)[1] - extended_attention_mask = tf.reshape(tf.cast(attention_mask, tf.float32), (B, 1, 1, L)) - # Large negative for masked positions (kept in float32) - extended_attention_mask = (1.0 - extended_attention_mask) * (-1e9) + attention_mask_shape = self.shape_list(attention_mask) + + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) attention_mask = extended_attention_mask @@ -290,32 +276,30 @@ def call(self, inputs, training=False, output_hidden_states=False): if output_hidden_states: all_hidden_states.append(hidden_states) - # Final 
last_hidden_state (B, T, H) in float32 (no pooling here) - last_hidden_state = tf.cast(hidden_states, tf.float32) + # Pooling + if self.pooling_method == "mean": + pooled_output = tf.reduce_mean(hidden_states, axis=1) + else: # default: cls + pooled_output = hidden_states[:, 0, :] + + # Apply pooler if return_dense is True + if self.return_dense: + pooled_output = pooled_output + + # Normalize embeddings if specified + if self.normalize_embeddings: + pooled_output = tf.nn.l2_normalize(pooled_output, axis=-1) ## colbert_vecs - colbert_vecs = None - if self.return_colbert_vecs: - # Compute in the native dtype (e.g., float16 under mixed precision) - colbert_in = hidden_states[:, 1:] - colbert_out = self.colbert_linear(colbert_in) - # Match mask dtype to colbert_out to avoid dtype mismatch in multiplication - m = tf.cast(attention_mask_origin[:, 1:], colbert_out.dtype)[:, :, None] - colbert_out = colbert_out * m - # Return as float32 for serving stability - colbert_vecs = tf.cast(colbert_out, tf.float32) + colbert_vecs = self.colbert_linear(hidden_states[:, 1:]) + colbert_vecs = colbert_vecs * tf.cast(attention_mask_origin[:, 1:][:, :, None], dtype=tf.float32) outputs = { - "last_hidden_state": last_hidden_state + "dense_vecs": pooled_output, + "colbert_vecs": colbert_vecs, + "last_hidden_state": hidden_states } - if colbert_vecs is not None: - outputs["colbert_vecs"] = colbert_vecs - - if self.return_sparse: - token_weights = tf.nn.relu(self.sparse_linear(hidden_states)) - outputs["token_weights"] = token_weights - if output_hidden_states: outputs["hidden_states"] = all_hidden_states @@ -327,15 +311,17 @@ def __init__(self, d_model, num_heads, intermediate_size, dropout_rate=0.1, **kw super().__init__(**kwargs) self.attention = MultiHeadAttention(d_model, num_heads, dropout_rate) + self.attention_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) + self.attention_dropout = tf.keras.layers.Dropout(dropout_rate) # Intermediate -> gelu_approx self.intermediate = tf.keras.layers.Dense( intermediate_size, - name="intermediate_dense" + name="intermediate.dense" ) - self.output_dense = tf.keras.layers.Dense(d_model, name="output_dense") + self.output_dense = tf.keras.layers.Dense(d_model, name="output.dense") self.output_dropout = tf.keras.layers.Dropout(dropout_rate) - self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="output_LayerNorm") + self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) def gelu_approx(self, x): x = tf.convert_to_tensor(x) @@ -364,57 +350,53 @@ def call(self, x, attention_mask=None, training=False): return output -def save_model_with_tokenizer(model: "BGEM3TensorFlow", tokenizer, save_path: str): - """Export SavedModel with a single clean default signature. 
- - inputs : int64 (input_ids, attention_mask) - outputs: last_hidden_state (B,T,H,float32), optional colbert_vecs (B,T-1,H,float32) - """ +def save_model_with_tokenizer(model, tokenizer, save_path): + """Save both model and tokenizer""" os.makedirs(save_path, exist_ok=True) model_save_path = os.path.join(save_path, 'model') - # Clean previous export to avoid stale graph/variable metadata - try: - import shutil - if os.path.exists(model_save_path): - shutil.rmtree(model_save_path) - except Exception: - pass - - # Build variables once - dummy = { - 'input_ids': tf.zeros((2, 8), dtype=tf.int32), - 'attention_mask': tf.ones((2, 8), dtype=tf.int32), - 'token_type_ids': tf.zeros((2, 8), dtype=tf.int32), + + # Ensure model is built by calling it with dummy inputs + dummy_inputs = { + 'input_ids': tf.zeros((2, 11), dtype=tf.int32), + 'attention_mask': tf.ones((2, 11), dtype=tf.int32) } - _ = model(dummy, training=False, output_hidden_states=False) + _ = model(dummy_inputs, training=False, output_hidden_states=True) + # Define serving signature @tf.function(input_signature=[ - tf.TensorSpec([None, None], tf.int64, name='input_ids'), - tf.TensorSpec([None, None], tf.int64, name='attention_mask'), + tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='input_ids'), + tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='attention_mask') ]) - def serving_default(input_ids, attention_mask): - # Cast to int32, synthesize token_type_ids - ii = tf.cast(input_ids, tf.int32) - am = tf.cast(attention_mask, tf.int32) - tt = tf.zeros_like(ii) - - outs = model({'input_ids': ii, 'attention_mask': am, 'token_type_ids': tt}, - training=False, output_hidden_states=False) + def serving_fn(input_ids, attention_mask): - ret = { - 'last_hidden_state': tf.cast(outs['last_hidden_state'], tf.float32) + print(input_ids) + inputs = { + 'input_ids': input_ids, + 'attention_mask': attention_mask } - if 'colbert_vecs' in outs: - ret['colbert_vecs'] = tf.cast(outs['colbert_vecs'], tf.float32) - return ret - # Save the Keras model itself with a single default signature + outputs = model(inputs=inputs, training=False, output_hidden_states=True) + + if outputs.get('hidden_states'): + hidden_states = tf.stack(outputs['hidden_states'], axis=0) + return { + 'dense_vecs': outputs['dense_vecs'], # CLS Token + 'colbert_vecs': outputs['colbert_vecs'], + 'hidden_states': hidden_states # (num_layers, batch, seq_len, hidden_dim) + } + else: + return { + 'dense_vecs': outputs['dense_vecs'], + } + + # Save model tf.saved_model.save( model, model_save_path, - signatures={'serving_default': serving_default} + signatures={'serving_default': serving_fn} ) + # Save tokenizer tokenizer.save_pretrained(save_path) return model_save_path diff --git a/BGEM3TFModel_tfkeras2.py b/BGEM3TFModel_tfkeras2.py new file mode 100644 index 0000000..cb538b2 --- /dev/null +++ b/BGEM3TFModel_tfkeras2.py @@ -0,0 +1,246 @@ +from typing import Dict, List, Union +import numpy as np +import tensorflow as tf +from tensorflow.keras import layers, Model + +def gelu_exact(x): + # exact erf-based GELU + return tf.nn.gelu(x, approximate=False) + +# --- 맨 위에 추가 --- +class TorchLayerNorm(layers.Layer): + def __init__(self, hidden_size: int, eps: float = 1e-5, name: str = None, **kwargs): + super().__init__(name=name, **kwargs) + self.hidden_size = hidden_size + self.eps = eps + + def build(self, input_shape): + self.gamma = self.add_weight(name="gamma", shape=(self.hidden_size,), initializer="ones", dtype=tf.float32, trainable=True) + self.beta = 
self.add_weight(name="beta", shape=(self.hidden_size,), initializer="zeros", dtype=tf.float32, trainable=True) + super().build(input_shape) + + def call(self, x): + # PyTorch LayerNorm과 동일: 모집단 분산(mean of squares) 사용 + mean = tf.reduce_mean(x, axis=-1, keepdims=True) + var = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True) + xhat = (x - mean) / tf.sqrt(var + self.eps) + return xhat * self.gamma + self.beta + + +class TorchLayerNormTF1: + def __init__(self, hidden_size: int, eps: float = 1e-5, scope: str = "LayerNorm"): + self.hidden_size = hidden_size + self.eps = eps + with tf.compat.v1.variable_scope(scope, reuse=tf.compat.v1.AUTO_REUSE): + self.gamma = tf.compat.v1.get_variable( + "weight", + shape=[hidden_size], + initializer=tf.compat.v1.initializers.ones(), + dtype=tf.float32, + ) + self.beta = tf.compat.v1.get_variable( + "bias", + shape=[hidden_size], + initializer=tf.compat.v1.initializers.zeros(), + dtype=tf.float32, + ) + + def __call__(self, x): + mean = tf.reduce_mean(x, axis=-1, keepdims=True) + var = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True) + xhat = (x - mean) / tf.sqrt(var + self.eps) + return xhat * self.gamma + self.beta + + +class MultiHeadAttention(layers.Layer): + def __init__(self, d_model: int, num_heads: int, dropout_rate: float = 0.0, **kwargs): + super().__init__(**kwargs) + if d_model % num_heads != 0: + raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})") + self.d_model = d_model + self.num_heads = num_heads + self.depth = d_model // num_heads + + self.wq = layers.Dense(d_model, use_bias=True, name=f"{self.name}.wq") + self.wk = layers.Dense(d_model, use_bias=True, name=f"{self.name}.wk") + self.wv = layers.Dense(d_model, use_bias=True, name=f"{self.name}.wv") + self.dense = layers.Dense(d_model, use_bias=True, name=f"{self.name}.dense") + + self.attlayerNorm = TorchLayerNormTF1(self.d_model, eps=1e-5, scope=f"{self.name}.attlayerNorm") + self.dropout = layers.Dropout(rate=dropout_rate) + + def split_heads(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, inputs, mask=None, training=False): + bsz = tf.shape(inputs)[0] + q = self.split_heads(self.wq(inputs), bsz) + k = self.split_heads(self.wk(inputs), bsz) + v = self.split_heads(self.wv(inputs), bsz) + + dk = tf.cast(self.depth, tf.float32) + attn_scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(dk) + if mask is not None: + attn_scores = attn_scores + mask + + # stable softmax (+1e-9) to mirror the working code + attn_probs = tf.nn.softmax(attn_scores + 1e-9, axis=-1) + attn_probs = self.dropout(attn_probs, training=training) + + ctx = tf.matmul(attn_probs, v) + ctx = tf.transpose(ctx, perm=[0, 2, 1, 3]) + ctx = tf.reshape(ctx, (bsz, -1, self.d_model)) + + out = self.dense(ctx) + if training: + out = self.dropout(out, training=training) + out = self.attlayerNorm(out + inputs) + return out + +class TransformerBlock(layers.Layer): + def __init__(self, d_model: int, num_heads: int, intermediate_size: int, dropout_rate: float = 0.0, **kwargs): + super().__init__(**kwargs) + self.attention = MultiHeadAttention(d_model, num_heads, dropout_rate, name=f"{self.name}.attention") + self.intermediate = layers.Dense(intermediate_size, use_bias=True, name=f"{self.name}.intermediate.dense") + self.output_dense = layers.Dense(d_model, use_bias=True, name=f"{self.name}.output.dense") + self.output_dropout = layers.Dropout(dropout_rate) + self.output_norm = 
TorchLayerNormTF1(d_model, eps=1e-5, scope=f"{self.name}.output.LayerNorm") + + def call(self, x, attention_mask=None, training=False): + x_att = self.attention(x, mask=attention_mask, training=training) + inter = self.intermediate(x_att) + inter = gelu_exact(inter) + out = self.output_dense(inter) + if training: + out = self.output_dropout(out, training=training) + x_out = self.output_norm(out + x_att) + return x_out + +class BGEM3TensorFlow(Model): + def __init__( + self, + vocab_size: int = 250002, + max_position_embeddings: int = 8194, + type_vocab_size: int = 1, + hidden_size: int = 1024, + num_hidden_layers: int = 24, + num_attention_heads: int = 16, + intermediate_size: int = 4096, + dropout_rate: float = 0.0, + name: str = "bge-m3-tensorflow", + ): + super().__init__(name=name) + self.padding_idx = 1 + self.hidden_size = hidden_size + self.num_layers = num_hidden_layers + + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="embeddings", shape=[vocab_size, hidden_size], + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), dtype=tf.float32 + ) + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", shape=[max_position_embeddings, hidden_size], + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), dtype=tf.float32 + ) + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", shape=[type_vocab_size, hidden_size], + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), dtype=tf.float32 + ) + + # 이름은 HF와 다를 수 있어도 상관없음. 가중치만 정확히 들어가면 됩니다. + self.layerNorm = TorchLayerNormTF1(hidden_size, eps=1e-5, scope="embeddings.LayerNorm") + self.dropout = layers.Dropout(rate=dropout_rate) + + self.encoder_layers: List[TransformerBlock] = [] + for i in range(num_hidden_layers): + self.encoder_layers.append( + TransformerBlock( + d_model=hidden_size, + num_heads=num_attention_heads, + intermediate_size=intermediate_size, + dropout_rate=dropout_rate, + name=f"encoder.layer.{i}", + ) + ) + + # pooler는 내보내기 시점 디버그용으로만 빌드. dense_vecs는 raw CLS 반환. + self.pooler = layers.Dense(hidden_size, activation="tanh", use_bias=True, name="pooler.dense") + self.colbert_linear = layers.Dense(units=hidden_size, use_bias=True, name="colbert_linear") + + # 경고 제거용 + def build(self, input_shape): + self.built = True + + @staticmethod + def _create_position_ids_from_attention_mask(attention_mask, padding_idx=1, past_key_values_length=0): + # attention_mask: [B,T] 0/1 + mask = tf.cast(attention_mask, tf.int32) + incremental = tf.math.cumsum(mask, axis=1) + if past_key_values_length != 0: + incremental = incremental + tf.cast(past_key_values_length, tf.int32) + # pads -> padding_idx, tokens -> cumsum + padding_idx + return incremental * mask + tf.cast(padding_idx, tf.int32) + + @staticmethod + def _shape_list(t: Union[tf.Tensor, np.ndarray]) -> List[int]: + if isinstance(t, np.ndarray): + return list(t.shape) + dynamic = tf.shape(t) + static = t.shape.as_list() + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + # === HF 로직과 완전히 동일하게 다시 작성 (형/연산 순서 포함) === + @staticmethod + def _create_position_ids_from_input_ids(input_ids, padding_idx=1, past_key_values_length=0): + # Replace non-padding symbols with their position numbers. + # Position numbers begin at padding_idx + 1. 
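+        # Worked example (illustrative ids): input_ids [[0, 2444, 2, 1, 1]] with padding_idx=1
+        #   mask              = [[1, 1, 1, 0, 0]]
+        #   cumsum(mask)      = [[1, 2, 3, 3, 3]]
+        #   cumsum*mask + pad = [[2, 3, 4, 1, 1]]  -> pads keep padding_idx, real tokens start at padding_idx + 1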
+ mask = tf.cast(tf.not_equal(input_ids, padding_idx), dtype=tf.int32) + incremental = tf.math.cumsum(mask, axis=1) + if past_key_values_length != 0: + incremental = incremental + tf.cast(past_key_values_length, tf.int32) + incremental = incremental * mask + tf.cast(padding_idx, tf.int32) + return tf.cast(incremental, dtype=input_ids.dtype) + + def call(self, inputs: Dict[str, tf.Tensor], training=False, output_hidden_states: bool = False): + input_ids = tf.cast(inputs["input_ids"], tf.int32) + attention_mask = tf.cast(inputs["attention_mask"], tf.int32) + + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + pos_ids = self._create_position_ids_from_attention_mask(attention_mask, padding_idx=self.padding_idx) + pos_ids = tf.cast(pos_ids, dtype=input_ids.dtype) + pos_embeds = tf.gather(params=self.position_embeddings, indices=pos_ids) + + token_type_ids = tf.zeros_like(input_ids) + tok_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + + hidden_states = inputs_embeds + pos_embeds + tok_type_embeds + hidden_states = self.layerNorm(hidden_states) + if training: + hidden_states = self.dropout(hidden_states, training=training) + + bsz, _, _ = self._shape_list(hidden_states) + ext_mask = tf.reshape(attention_mask, (bsz, 1, 1, -1)) + ext_mask = tf.cast(ext_mask, dtype=hidden_states.dtype) + ext_mask = (1.0 - ext_mask) * tf.constant(-10000.0, dtype=hidden_states.dtype) + + all_h = [] + if output_hidden_states: + all_h.append(hidden_states) + + for layer in self.encoder_layers: + hidden_states = layer(hidden_states, attention_mask=ext_mask, training=training) + if output_hidden_states: + all_h.append(hidden_states) + + last_hidden_state = hidden_states + colbert_vecs = self.colbert_linear(last_hidden_state[:, 1:]) + colbert_vecs = colbert_vecs * tf.cast(attention_mask[:, 1:][:, :, None], dtype=tf.float32) + + out = {"last_hidden_state": last_hidden_state, "colbert_vecs": colbert_vecs} + if output_hidden_states: + out["hidden_states"] = tf.stack(all_h, axis=0) + return out diff --git a/BGEM3WeightConverter.py b/BGEM3WeightConverter.py index 8164086..e6a26c2 100644 --- a/BGEM3WeightConverter.py +++ b/BGEM3WeightConverter.py @@ -16,8 +16,7 @@ def load_sparse_weights(): raise FileNotFoundError(f"FileNotFoundError: {model_path}") device = 'cuda' if torch.cuda.is_available() else 'cpu' - # Avoid weights_only for broader PyTorch compatibility - return torch.load(model_path, map_location=device) + return torch.load(model_path, map_location=device, weights_only=True) def load_colbert_weights(): @@ -56,58 +55,11 @@ def _init_colbert_weights(tf_model): colbert = load_colbert_weights() colbert_weights = colbert['weight'] colbert_bias = colbert['bias'] - # Convert to numpy and report shape - w = colbert_weights.detach().cpu().numpy() if hasattr(colbert_weights, "detach") else np.array(colbert_weights) - b = colbert_bias.detach().cpu().numpy() if hasattr(colbert_bias, "detach") else np.array(colbert_bias) - out_dim, in_dim = w.shape # PT: (out_dim, in_dim) - print(f"ColBERT head weight shape: (out_dim={out_dim}, in_dim={in_dim})") - - # Ensure the Dense layer has matching units and is built - try: - current_units = getattr(tf_model.colbert_linear, "units", None) - except Exception: - current_units = None - - if current_units is not None and current_units != out_dim: - # Units mismatch; warn. Ideally create the model with detected colbert_dim to avoid this. - print(f"Warning: colbert_linear units ({current_units}) != detected out_dim ({out_dim}). 
We will attempt to set weights and may fail.") - - # Ensure variables exist. If not built yet, do a dummy call to build with correct in_dim. - if not getattr(tf_model.colbert_linear, "built", False): - dummy = tf.zeros((1, 2, in_dim), dtype=tf.float32) - _ = tf_model.colbert_linear(dummy) - - # Set weights (kernel shape: (in_dim, out_dim)) - tf_model.colbert_linear.set_weights([w.T, b]) - - -def _init_sparse_weights(tf_model): - """Initialize sparse head weights if available (optional).""" - try: - st = load_sparse_weights() - except FileNotFoundError as e: - print(str(e)) - return - - # Expect PyTorch shape: (out_dim=1, in_dim=hidden) - w_pt = st["weight"] - b_pt = st["bias"] - # Ensure numpy - if hasattr(w_pt, "cpu"): - w_np = w_pt.cpu().numpy() - else: - w_np = np.array(w_pt) - if hasattr(b_pt, "cpu"): - b_np = b_pt.cpu().numpy() - else: - b_np = np.array(b_pt) - - # Build layer if not built - in_dim = w_np.shape[1] - tf_model.sparse_linear.build((None, None, in_dim)) - # Keras Dense kernel shape: (in_dim, out_dim) - tf_model.sparse_linear.set_weights([w_np.T, b_np]) + tf_model.colbert_linear.set_weights([ + colbert_weights.numpy().T, + colbert_bias.numpy() + ]) class BGEM3WeightConverter: @@ -133,15 +85,15 @@ def initialize_weights(self, tf_model): # Initialize encoder layers self._init_transformer_blocks(tf_model) - # Initialize pooler (once) + # Initialize pooler + self._init_pooler_weights(tf_model) + + # Initialize pooler self._init_pooler_weights(tf_model) # Initialize colbert _init_colbert_weights(tf_model) - # Initialize sparse head (optional) - _init_sparse_weights(tf_model) - return tf_model def _init_embedding_weights(self, tf_model): @@ -278,28 +230,9 @@ def _init_pooler_weights(self, tf_model): def convert_and_save_model(model_name: str, save_path: str): - """Convert PyTorch model to TensorFlow and save. - Also detects and uses original ColBERT dimension for TF head. 
- """ - # Detect ColBERT original dimension from weights (out_dim) - try: - colbert = load_colbert_weights() - colbert_w = colbert['weight'] - out_dim = int(colbert_w.shape[0]) - print(f"Detected ColBERT dimension: {out_dim}") - colbert_dim = out_dim - return_colbert_vecs = True - except Exception as e: - print(f"ColBERT weights not found or failed to load: {e}") - colbert_dim = -1 - return_colbert_vecs = False - - # Initialize TensorFlow model with detected colbert_dim - tf_model = BGEM3TensorFlow( - model_name, - colbert_dim=colbert_dim, - return_colbert_vecs=return_colbert_vecs, - ) + """Convert PyTorch model to TensorFlow and save""" + # Initialize TensorFlow model + tf_model = BGEM3TensorFlow(model_name) # Convert weights converter = BGEM3WeightConverter(model_name) diff --git a/export_tf1_saved_model.py b/export_tf1_saved_model.py new file mode 100644 index 0000000..38cebb7 --- /dev/null +++ b/export_tf1_saved_model.py @@ -0,0 +1,369 @@ +import os +import argparse +import numpy as np +import torch +import tensorflow as tf +from transformers import AutoTokenizer +# from BGEM3TFModel_tfkeras2 import BGEM3TensorFlow + +os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "1") + +def _np(t) -> np.ndarray: + if hasattr(t, "detach"): + t = t.detach() + if hasattr(t, "cpu"): + t = t.cpu() + arr = np.array(t) + if arr.dtype != np.float32: + arr = arr.astype(np.float32) + return arr + +def load_state_dict(model_or_path: str) -> dict: + if os.path.isdir(model_or_path): + pt = os.path.join(model_or_path, "pytorch_model.bin") + else: + from huggingface_hub import snapshot_download + local = snapshot_download(repo_id=model_or_path) + pt = os.path.join(local, "pytorch_model.bin") + print(f"[load] pytorch_model.bin: {pt}") + return torch.load(pt, map_location="cpu") + +def load_colbert_weight(model_or_path: str): + try: + if os.path.isdir(model_or_path): + p = os.path.join(model_or_path, "colbert_linear.pt") + else: + from huggingface_hub import snapshot_download + local = snapshot_download(repo_id=model_or_path) + p = os.path.join(local, "colbert_linear.pt") + st = torch.load(p, map_location="cpu") + if isinstance(st, dict): + W = st.get("weight") + B = st.get("bias") + if W is None: + first_key = next(iter(st)) + W = st[first_key] + B = st.get("bias", None) + else: + W, B = (st, None) if not isinstance(st, (list, tuple)) else st + W = _np(W) + B = _np(B) if B is not None else None + print(f"[colbert] head: out_dim={W.shape[0]}, in_dim={W.shape[1]}, bias={'yes' if B is not None else 'no'}") + return W, B + except Exception as e: + print(f"[colbert] not found ({e}); skipping)") + return None, None + +def _to_tf_dtype(d) -> tf.dtypes.DType: + try: + return tf.as_dtype(d) + except Exception: + if hasattr(d, "dtype"): + try: + return tf.as_dtype(d.dtype) + except Exception: + pass + if isinstance(d, str): + return tf.as_dtype(d) + return tf.float32 + +def assign_via_feed(sess: tf.compat.v1.Session, var, value_np: np.ndarray): + """ + 그래프에 대용량 Const를 남기지 않도록 placeholder feed 기반으로 assign. + var.dtype가 문자열("float32")이어도 안전하게 동작하도록 tf.as_dtype로 강제변환. 
+ """ + # KerasVariable -> tf.Variable 강제 변환 (그래프 모드에서 안전) + var = _as_tf_variable(var) + # var.dtype이 'float32' 같은 문자열일 수 있으므로 반드시 캐스팅 + dtype = tf.as_dtype(getattr(var, "dtype", tf.float32)) + try: + base_dtype = dtype.base_dtype + except Exception: + base_dtype = dtype + + ph = tf.compat.v1.placeholder( + dtype=base_dtype, + shape=value_np.shape, + name=var.name.split(":")[0] + "_ph", + ) + # 그래프 모드 assign을 명시적으로 사용 + op = tf.compat.v1.assign(var, ph) + sess.run(op, feed_dict={ph: value_np}) + +def _force_build_dense(layer: tf.keras.layers.Dense, hidden_size: int): + dummy = tf.zeros([1, hidden_size], dtype=tf.float32) + _ = layer(dummy) + +def _debug_dump_embedding_stats(sess, tf_model): + """ + TF1 그래프 모드에서 Keras 3 변수를 안전하게 읽어서 통계를 출력. + KerasVariable -> (var.value) -> (read_value()) -> Tensor -> sess.run() + """ + def _eval(sess, var_like): + v = _as_tf_variable(var_like) + # 최종적으로 Variable/Tensor를 fetch + return sess.run(v) + + w = _eval(sess, tf_model.weight) + p = _eval(sess, tf_model.position_embeddings) + t = _eval(sess, tf_model.token_type_embeddings) + g = _eval(sess, tf_model.layerNorm.gamma) + b = _eval(sess, tf_model.layerNorm.beta) + + print(f"[check] word_emb mean={w.mean():.6f} std={w.std():.6f}") + print(f"[check] pos_emb mean={p.mean():.6f} std={p.std():.6f}") + print(f"[check] tok_emb mean={t.mean():.6f} std={t.std():.6f}") + print(f"[check] emb_LN γ mean={g.mean():.6f} std={g.std():.6f}") + print(f"[check] emb_LN β mean={b.mean():.6f} std={b.std():.6f}") + + +def _pt_style_l0(sd, input_ids_np, attention_mask_np, padding_idx=1, eps=1e-5): + """PyTorch 수식과 동일한 방식으로 임베딩+LayerNorm(L0) 계산.""" + we = _np(sd["embeddings.word_embeddings.weight"]) + pe = _np(sd["embeddings.position_embeddings.weight"]) + te = _np(sd["embeddings.token_type_embeddings.weight"]) # [type_vocab_size, H] + gamma = _np(sd["embeddings.LayerNorm.weight"]) + beta = _np(sd["embeddings.LayerNorm.bias"]) + + mask = attention_mask_np.astype(np.int32) + pos_ids = np.cumsum(mask, axis=1) * mask + padding_idx + + emb = we[input_ids_np] + pe[pos_ids] + te[0] + mean = emb.mean(axis=-1, keepdims=True) + var = ((emb - mean) ** 2).mean(axis=-1, keepdims=True) + xhat = (emb - mean) / np.sqrt(var + eps) + return xhat * gamma + beta + +def export_tf1_saved_model(model_name_or_path: str, out_root: str): + tf.keras.backend.clear_session() + tf.compat.v1.reset_default_graph() + tf.compat.v1.disable_eager_execution() + + print("[tokenizer] loading...") + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + + print("[state_dict] loading...") + sd = load_state_dict(model_name_or_path) + col_W, col_b = load_colbert_weight(model_name_or_path) + + g = tf.compat.v1.Graph() + with g.as_default(): + # Placeholders + input_ids = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_ids") + attention_mask = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="attention_mask") + input_ids_i64 = tf.compat.v1.placeholder(tf.int64, shape=[None, None], name="input_ids_i64") + attention_mask_i64 = tf.compat.v1.placeholder(tf.int64, shape=[None, None], name="attention_mask_i64") + + hidden_size = sd["embeddings.word_embeddings.weight"].shape[1] + tf_model = BGEM3TensorFlow( + vocab_size=sd["embeddings.word_embeddings.weight"].shape[0], + max_position_embeddings=sd["embeddings.position_embeddings.weight"].shape[0], + type_vocab_size=sd["embeddings.token_type_embeddings.weight"].shape[0], + hidden_size=hidden_size, + num_hidden_layers=24, num_attention_heads=16, intermediate_size=4096, + dropout_rate=0.0, 
name="bge-m3-tensorflow", + ) + + # 구현 타입 확인: 모두 TorchLayerNormTF1 이어야 함 + try: + print("[impl] embeddings LN:", type(tf_model.layerNorm)) + print("[impl] attn LN(0):", type(tf_model.encoder_layers[0].attention.attlayerNorm)) + print("[impl] ffn LN(0):", type(tf_model.encoder_layers[0].output_norm)) + except Exception: + pass + + # Build graph (variables created here) + outs32 = tf_model({"input_ids": input_ids, "attention_mask": attention_mask}, + training=False, output_hidden_states=True) + last32 = tf.identity(outs32["last_hidden_state"], name="last_hidden_state") + col32 = tf.identity(outs32["colbert_vecs"], name="colbert_vecs") + dense32 = tf.identity(last32[:, 0, :], name="dense_vecs") # raw CLS + hidd32 = tf.identity(outs32["hidden_states"], name="hidden_states") + + outs64 = tf_model({"input_ids": tf.cast(input_ids_i64, tf.int32), + "attention_mask": tf.cast(attention_mask_i64, tf.int32)}, + training=False, output_hidden_states=True) + last64 = tf.identity(outs64["last_hidden_state"], name="last_hidden_state_i64") + col64 = tf.identity(outs64["colbert_vecs"], name="colbert_vecs_i64") + dense64 = tf.identity(last64[:, 0, :], name="dense_vecs_i64") # raw CLS + hidd64 = tf.identity(outs64["hidden_states"], name="hidden_states_i64") + + # pooler 강제 빌드(검증용) + _force_build_dense(tf_model.pooler, hidden_size) + + init = tf.compat.v1.global_variables_initializer() + with tf.compat.v1.Session(graph=g) as sess: + sess.run(init) + + # === Embedding block === + assign_via_feed(sess, tf_model.weight, _np(sd["embeddings.word_embeddings.weight"])) + assign_via_feed(sess, tf_model.position_embeddings, _np(sd["embeddings.position_embeddings.weight"])) + assign_via_feed(sess, tf_model.token_type_embeddings, _np(sd["embeddings.token_type_embeddings.weight"])) + assign_via_feed(sess, tf_model.layerNorm.gamma, _np(sd["embeddings.LayerNorm.weight"])) + assign_via_feed(sess, tf_model.layerNorm.beta, _np(sd["embeddings.LayerNorm.bias"])) + + # 즉시 숫자 확인 (여기서 이상하면 레이어 0부터 틀어집니다) + _debug_dump_embedding_stats(sess, tf_model) + + # === Encoder blocks === + for i, blk in enumerate(tf_model.encoder_layers): + assign_via_feed(sess, blk.attention.wq.kernel, _np(sd[f"encoder.layer.{i}.attention.self.query.weight"]).T) + assign_via_feed(sess, blk.attention.wq.bias, _np(sd[f"encoder.layer.{i}.attention.self.query.bias"])) + assign_via_feed(sess, blk.attention.wk.kernel, _np(sd[f"encoder.layer.{i}.attention.self.key.weight"]).T) + assign_via_feed(sess, blk.attention.wk.bias, _np(sd[f"encoder.layer.{i}.attention.self.key.bias"])) + assign_via_feed(sess, blk.attention.wv.kernel, _np(sd[f"encoder.layer.{i}.attention.self.value.weight"]).T) + assign_via_feed(sess, blk.attention.wv.bias, _np(sd[f"encoder.layer.{i}.attention.self.value.bias"])) + + assign_via_feed(sess, blk.attention.dense.kernel, _np(sd[f"encoder.layer.{i}.attention.output.dense.weight"]).T) + assign_via_feed(sess, blk.attention.dense.bias, _np(sd[f"encoder.layer.{i}.attention.output.dense.bias"])) + assign_via_feed(sess, blk.attention.attlayerNorm.gamma, + _np(sd[f"encoder.layer.{i}.attention.output.LayerNorm.weight"])) + assign_via_feed(sess, blk.attention.attlayerNorm.beta, + _np(sd[f"encoder.layer.{i}.attention.output.LayerNorm.bias"])) + + assign_via_feed(sess, blk.intermediate.kernel, _np(sd[f"encoder.layer.{i}.intermediate.dense.weight"]).T) + assign_via_feed(sess, blk.intermediate.bias, _np(sd[f"encoder.layer.{i}.intermediate.dense.bias"])) + assign_via_feed(sess, blk.output_dense.kernel, _np(sd[f"encoder.layer.{i}.output.dense.weight"]).T) + 
assign_via_feed(sess, blk.output_dense.bias, _np(sd[f"encoder.layer.{i}.output.dense.bias"])) + assign_via_feed(sess, blk.output_norm.gamma, _np(sd[f"encoder.layer.{i}.output.LayerNorm.weight"])) + assign_via_feed(sess, blk.output_norm.beta, _np(sd[f"encoder.layer.{i}.output.LayerNorm.bias"])) + + if i % 4 == 0 or i == len(tf_model.encoder_layers) - 1: + print(f"[encoder {i}] weights mapped") + + # pooler (검증용, dense_vecs에는 사용 안 함) + if "pooler.dense.weight" in sd and "pooler.dense.bias" in sd: + assign_via_feed(sess, tf_model.pooler.kernel, _np(sd["pooler.dense.weight"]).T) + assign_via_feed(sess, tf_model.pooler.bias, _np(sd["pooler.dense.bias"])) + print("[pooler] initialized") + + # colbert + if col_W is not None: + assign_via_feed(sess, tf_model.colbert_linear.kernel, col_W.T) + if hasattr(tf_model.colbert_linear, "bias") and tf_model.colbert_linear.bias is not None: + if col_b is None: + col_b = np.zeros((col_W.shape[0],), dtype=np.float32) + assign_via_feed(sess, tf_model.colbert_linear.bias, col_b) + print(f"[colbert] initialized (out_dim={col_W.shape[0]}, in_dim={col_W.shape[1]})") + else: + print("[colbert] weights not found; skipping") + + # 내장 Sanity Check: L0가 PT와 붙는지 확인 (저장 이전) + try: + ids = tokenizer(["hello"], padding=True, truncation=True, max_length=8) + inp_ids = np.array(ids["input_ids"], dtype=np.int32) + att_msk = np.array(ids["attention_mask"], dtype=np.int32) + pt_l0 = _pt_style_l0(sd, inp_ids, att_msk, padding_idx=1, eps=1e-5) + tf_l0 = sess.run(hidd32[0], feed_dict={input_ids: inp_ids, attention_mask: att_msk}) + mse_l0 = np.mean((pt_l0 - tf_l0) ** 2) + print(f"[sanity] L0 MSE vs PT: {mse_l0:.8f}") + assert mse_l0 < 1e-6, "Embedding+LayerNorm (L0) mismatch; abort saving!" + except Exception as e: + raise + + # Signatures + sig_default = tf.compat.v1.saved_model.signature_def_utils.predict_signature_def( + inputs={"input_ids": input_ids, "attention_mask": attention_mask}, + outputs={"dense_vecs": dense32, "last_hidden_state": last32, "hidden_states": hidd32, + "colbert_vecs": col32}, + ) + sig_int64 = tf.compat.v1.saved_model.signature_def_utils.predict_signature_def( + inputs={"input_ids_i64": input_ids_i64, "attention_mask_i64": attention_mask_i64}, + outputs={"dense_vecs_i64": dense64, "last_hidden_state_i64": last64, "hidden_states_i64": hidd64, + "colbert_vecs_i64": col64}, + ) + + export_dir = os.path.join(out_root, "model") + os.makedirs(export_dir, exist_ok=True) + + # ★ 여기서 '모든 변수'를 모아 커스텀 Saver를 만든다 + var_list = _collect_all_variables_for_saver(tf_model) + saver = tf.compat.v1.train.Saver( + var_list=var_list, + write_version=tf.compat.v1.train.SaverDef.V2, + save_relative_paths=True, + ) + + builder = tf.compat.v1.saved_model.Builder(export_dir) + builder.add_meta_graph_and_variables( + sess, + tags=[tf.compat.v1.saved_model.tag_constants.SERVING], + signature_def_map={"serving_default": sig_default, "serving_int64": sig_int64}, + clear_devices=True, + saver=saver, # ★ 커스텀 Saver 지정 (중요) + ) + builder.save() + print(f"[export] TF1 SavedModel saved to: {export_dir}") + + tokenizer.save_pretrained(out_root) + print(f"[export] tokenizer saved to: {out_root}") + +def _as_tf_variable(v): + """KerasVariable -> tf.Variable 로 변환 (그래프 모드). 
이미 tf.Variable이면 그대로 반환.""" + try: + # 가장 안전: 이미 tf.Variable 계열이면 그대로 사용 + if isinstance(v, tf.Variable): + return v + except Exception: + pass + # Keras 3의 래퍼가 내부 변수에 접근자를 제공할 수 있음 + inner = getattr(v, "variable", None) + if isinstance(inner, tf.Variable): + return inner + inner2 = getattr(v, "_variable", None) + if isinstance(inner2, tf.Variable): + return inner2 + # 일부는 .value가 property일 수 있으나, method인 경우가 있어 호출/반환 지양 + if hasattr(v, "value") and not callable(getattr(v, "value")): + inner3 = getattr(v, "value") + if isinstance(inner3, tf.Variable): + return inner3 + return v + +def _collect_all_variables_for_saver(tf_model): + """ + Saver에 전달할 '완전한' 변수 목록을 구성. + - Keras 3의 tf_model.variables (KerasVariable) 포함 + - TF1 컬렉션의 global/trainable/model 변수 포함 + - 이름으로 dedup + """ + vars_from_keras = [_as_tf_variable(v) for v in getattr(tf_model, "variables", [])] + + vars_global = list(tf.compat.v1.global_variables()) + vars_train = list(tf.compat.v1.trainable_variables()) + try: + vars_model = list(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.MODEL_VARIABLES)) + except Exception: + vars_model = [] + + all_vars = vars_from_keras + vars_global + vars_train + vars_model + + # 이름 기준 dedup (handle/참조가 달라도 같은 변수면 이름이 동일) + dedup = [] + seen = set() + for v in all_vars: + try: + name = v.name # e.g. 'bge-m3-tensorflow/...:0' + except Exception: + continue + if name not in seen: + seen.add(name) + dedup.append(v) + + # 디버그: 저장할 변수 개수/샘플 이름 출력 + print(f"[saver] variables to save: {len(dedup)}") + for nm in list(seen)[:5]: + print(" -", nm) + + return dedup + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default="BAAI/bge-m3") + parser.add_argument("--out", type=str, default="./converted_bge_m3_tf1_java_fixed") + args = parser.parse_args() + export_tf1_saved_model(args.model, args.out) + +if __name__ == "__main__": + main() diff --git a/model_conversion_validator.py b/model_conversion_validator.py index a417943..a054ed1 100644 --- a/model_conversion_validator.py +++ b/model_conversion_validator.py @@ -1,391 +1,168 @@ -import torch +# model_conversion_validator.py import numpy as np +import torch import tensorflow as tf from transformers import AutoTokenizer, AutoModel + def load_original_pytorch_model(model_name_or_path): - """ - 원본 Hugging Face(PyTorch) 모델 및 토크나이저를 로드한 뒤, - (model, tokenizer)를 반환합니다. - """ - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - - model = AutoModel.from_pretrained(model_name_or_path) - model.eval() # 평가 모드 - return model, tokenizer - - -def encode_with_pytorch_model( - model, - tokenizer, - queries, - max_length=128, - use_cls_pooling=True, - return_hidden_states=True -): - """ - PyTorch 모델로 임베딩 추출하는 함수. - use_cls_pooling=True이면 [CLS] 임베딩 반환, - False이면 Attention Mask 기반 mean pooling을 반환. - return_hidden_states=True 이면, 모든 레이어의 히든 스테이트도 반환. 
- """ - inputs = tokenizer( - queries, - padding=True, - truncation=True, - max_length=max_length, - return_tensors='pt' - ) + tok = AutoTokenizer.from_pretrained(model_name_or_path) + mdl = AutoModel.from_pretrained(model_name_or_path) + mdl.eval() + return mdl, tok - with torch.no_grad(): - outputs = model(**inputs, output_hidden_states=return_hidden_states) - hidden_states = outputs.last_hidden_state # (batch_size, seq_len, hidden_size) +def encode_with_pytorch_model(model, tokenizer, queries, max_length=128, use_cls_pooling=True): + inputs = tokenizer(queries, padding=True, truncation=True, max_length=max_length, return_tensors="pt") + with torch.no_grad(): + outputs = model(**inputs, output_hidden_states=True) + hidden_states = outputs.last_hidden_state # [B,T,H] + all_layer_outputs = outputs.hidden_states # tuple(len=emb+24) if use_cls_pooling: - # [CLS] 벡터 사용 - embeddings = hidden_states[:, 0, :] - else: - # Mean Pooling - attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float() - sum_embeddings = torch.sum(hidden_states * attention_mask, dim=1) - sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9) - embeddings = sum_embeddings / sum_mask - - if return_hidden_states: - # outputs.hidden_states: 튜플 (embedding_layer_output + 각 Transformer 레이어 출력) - all_layer_outputs = outputs.hidden_states # tuple of torch.Tensor - return embeddings.cpu().numpy(), all_layer_outputs + emb = hidden_states[:, 0, :].cpu().numpy() else: - return embeddings.cpu().numpy() + attn = inputs["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float() + sum_embeddings = torch.sum(hidden_states * attn, dim=1) + sum_mask = torch.clamp(attn.sum(dim=1), min=1e-9) + emb = (sum_embeddings / sum_mask).cpu().numpy() + + + return emb, all_layer_outputs def show_all_layer_outputs_pytorch(all_layer_outputs, print_values=False): - """ - PyTorch 레이어별 히든 스테이트의 shape 및 (옵션) 일부 실제 값을 출력하는 유틸 함수. - """ print("\n[PyTorch] All Layer Outputs:") for i, hs in enumerate(all_layer_outputs): - print(f" Layer {i} hidden state shape: {hs.shape}") + print(f" Layer {i} hidden state shape: {tuple(hs.shape)}") if print_values: - # 첫 배치, 첫 토큰, 앞 5개 차원 - sample_vals = hs[0, 0, :5] - print(f" Sample values (batch=0, token=0, dims=0~4): {sample_vals.cpu().numpy()}") - print() - - -def load_converted_tf_model(saved_model_dir): - """ - TF SavedModel 디렉토리에서 모델을 로드하고, - 같은 경로에 있는 토크나이저를 함께 로드합니다. - - - convert_and_save_model()나 save_model_with_tokenizer()로 - "model" 폴더와 토크나이저 저장 가정. - """ - model_path = f"{saved_model_dir}/model" - loaded_model = tf.saved_model.load(model_path) - # Prefer a compatible signature if available - prefer = [ - "serving_default", - "serving_int32_3in", - "serving_int64_3in", - "serving_int32_2in", - "serving_int64_2in", - ] - sigs = loaded_model.signatures - for k in prefer: - if k in sigs: - serving_fn = sigs[k] - break - else: - raise RuntimeError("No suitable serving signature found in SavedModel.") - - tokenizer = AutoTokenizer.from_pretrained(saved_model_dir) - return serving_fn, tokenizer - - -def call_signature(sig, input_ids, attention_mask, token_type_ids=None): - """ - Call SavedModel signature with automatic key/dtype adaptation. 
- - Supplies only required keys - - Fills missing token_type_ids with zeros - - Casts inputs to signature dtypes - """ - # structured_input_signature: (args, kwargs) - spec_kwargs = sig.structured_input_signature[1] - - def prepare(name, value): - if name not in spec_kwargs: - return None - if value is None and name == "token_type_ids": - value = tf.zeros_like(input_ids) - want = spec_kwargs[name].dtype - if hasattr(value, "dtype") and value.dtype != want: - value = tf.cast(value, want) - return value - - kwargs = {} - x = prepare("input_ids", input_ids) - if x is not None: - kwargs["input_ids"] = x - x = prepare("attention_mask", attention_mask) - if x is not None: - kwargs["attention_mask"] = x - x = prepare("token_type_ids", token_type_ids) - if x is not None: - kwargs["token_type_ids"] = x - - return sig(**kwargs) + print(" sample:", hs[0, 0, :5].cpu().numpy()) -def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): - """ - TensorFlow 모델(서빙 시그니처)로 임베딩 추출하는 함수. - SavedModel은 last_hidden_state (B,T,H)만 반환하므로 CLS 풀링을 적용해 (B,H) 임베딩 생성. - """ - inputs = tokenizer( - queries, - padding=True, - truncation=True, - max_length=max_length, - return_tensors="tf" - ) +def load_converted_tf_model(saved_root_dir: str): + model_dir = f"{saved_root_dir}/model" + loaded = tf.saved_model.load(model_dir) + sig = loaded.signatures["serving_default"] + tok = AutoTokenizer.from_pretrained(saved_root_dir) + return sig, tok - token_type_ids = inputs.get("token_type_ids", tf.zeros_like(inputs["input_ids"])) - outputs = call_signature(serving_fn, inputs["input_ids"], inputs["attention_mask"], token_type_ids) - # Serving returns last_hidden_state (B, T, H); apply CLS pooling for embedding - last_hidden = outputs["last_hidden_state"] # (B, T, H) - embeddings = last_hidden[:, 0, :].numpy() # (B, H) - - return embeddings - - -def encode_with_tf_model_and_get_hidden_states(serving_fn, tokenizer, queries, max_length=128): - """ - *주의*: - - TF SavedModel에서 레이어별 히든 스테이트도 반환한다고 가정할 때 사용 가능. - - 실제 변환된 모델이 'all_hidden_states'라는 키를 노출하지 않았다면 KeyError 발생 가능. 
- """ - inputs = tokenizer( - queries, - padding=True, - truncation=True, - max_length=max_length, - return_tensors="tf" - ) - token_type_ids = inputs.get("token_type_ids", tf.zeros_like(inputs["input_ids"])) - outputs = call_signature(serving_fn, inputs["input_ids"], inputs["attention_mask"], token_type_ids) +def call_signature(sig, input_ids, attention_mask): + # 강제 int32 캐스트 + if input_ids.dtype != tf.int32: + input_ids = tf.cast(input_ids, tf.int32) + if attention_mask.dtype != tf.int32: + attention_mask = tf.cast(attention_mask, tf.int32) + return sig(input_ids=input_ids, attention_mask=attention_mask) - # Only last_hidden_state is returned in serving; keep KeyError behavior for old path - hidden_states = outputs["hidden_states"] # will raise KeyError (by design) - final_embeddings = outputs["last_hidden_state"] - if "colbert_vecs" in outputs: - print("outputs['colbert_vecs'] : ") - print(outputs["colbert_vecs"]) - else: - print("colbert_vecs not returned by TF model (flag disabled).") - return final_embeddings.numpy(), hidden_states +def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): + inputs_pt = tokenizer(queries, padding=True, truncation=True, max_length=max_length, return_tensors="pt") + inputs_tf = tokenizer(queries, padding=True, truncation=True, max_length=max_length, return_tensors="tf") + # 1) 입력 동일성 보장 (매우 중요) + assert np.array_equal(inputs_pt["input_ids"].numpy(), inputs_tf["input_ids"].numpy()), "PT/TF input_ids mismatch" + assert np.array_equal(inputs_pt["attention_mask"].numpy(), inputs_tf["attention_mask"].numpy()), "PT/TF mask mismatch" -def show_all_layer_outputs_tf(all_layer_outputs, print_values=False): - """ - TensorFlow 레이어별 히든 스테이트 shape와 (옵션) 일부 실제 값을 출력 - (가정: all_layer_outputs가 (num_layers, batch, seq_len, hidden_dim) 형태) - """ - print("\n[TensorFlow] All Layer Outputs:") - for i, hs in enumerate(all_layer_outputs): - print(f" Layer {i} hidden state shape: {hs.shape}") - if print_values: - # 첫 배치, 첫 토큰, 앞 5개 차원 - sample_vals = hs[0, 0, :5].numpy() - print(f" Sample values (batch=0, token=0, dims=0~4): {sample_vals}") - print() + outputs = serving_fn( + input_ids=tf.cast(inputs_tf["input_ids"], tf.int32), + attention_mask=tf.cast(inputs_tf["attention_mask"], tf.int32), + ) + print(f'outputs >> {outputs}') + last_hidden = outputs["last_hidden_state"] # [B,T,H] + emb = last_hidden[:, 0, :].numpy() + hiddens = outputs.get("hidden_states", None) # (L+1,B,T,H) + print(f'hiddens, {hiddens}') + return emb, (hiddens.numpy() if hiddens is not None else None) def cosine_similarity(a, b): - """ - (batch_size, hidden_dim) 형태 numpy 배열 a, b에 대해 - 벡터별 코사인 유사도(batch_size,) 반환 - """ - a_norm = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-9) - b_norm = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-9) - cos_sim = np.sum(a_norm * b_norm, axis=1) - return cos_sim + a = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-9) + b = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-9) + return np.sum(a * b, axis=1) def mse(a, b): return np.mean((a - b) ** 2) -def compare_layer_outputs(pt_all_layer_outputs, tf_all_layer_outputs): - """ - PyTorch vs. TensorFlow 레이어별로 MSE, Cosine Similarity 등을 비교해주는 함수. - - pt_all_layer_outputs: tuple of torch.Tensor (길이: num_layers_PyTorch) - (예: [embedding_output, layer1_output, layer2_output, ...]) - - tf_all_layer_outputs: tf.Tensor (shape: [num_layers_TF, batch_size, seq_len, hidden_dim]) - (예: 0번이 embedding_output, 1번이 1번 레이어, ...) 
- """ - print("\n=== Compare Layer Outputs (PyTorch vs TensorFlow) ===") - - num_pt_layers = len(pt_all_layer_outputs) - num_tf_layers = tf_all_layer_outputs.shape[0] - min_layers = min(num_pt_layers, num_tf_layers) - - - layer_names = { - 0: "Embedding Layer", - } - for i in range(1, min_layers): - layer_names[i] = f"Encoder Layer {i}" - - print("pt_all_layer_outputs", len(pt_all_layer_outputs)) - - print("tf_all_layer_outputs", len(tf_all_layer_outputs)) - - for layer_idx in range(min_layers): - pt_layer = pt_all_layer_outputs[layer_idx] # shape: [batch, seq_len, hidden_dim] - tf_layer = tf_all_layer_outputs[layer_idx] # shape: [batch, seq_len, hidden_dim] - tf_layer_np = tf_layer.numpy() +def manual_l0_from_pt(sd, input_ids_np, attention_mask_np, padding_idx=1, eps=1e-5): + we = sd["embeddings.word_embeddings.weight"].cpu().numpy().astype(np.float32) + pe = sd["embeddings.position_embeddings.weight"].cpu().numpy().astype(np.float32) + te = sd["embeddings.token_type_embeddings.weight"].cpu().numpy().astype(np.float32) + gamma = sd["embeddings.LayerNorm.weight"].cpu().numpy().astype(np.float32) + beta = sd["embeddings.LayerNorm.bias"].cpu().numpy().astype(np.float32) - print(f"\n{layer_names[layer_idx]}:") - print(f"\n{layer_names[layer_idx]}:") - print(f"PyTorch shape: {pt_layer.shape}") - print(f" dims: [batch_size={pt_layer.shape[0]}, seq_len={pt_layer.shape[1]}, hidden_dim={pt_layer.shape[2]}]") - print(f"TensorFlow shape: {tf_layer.shape}") - print(f" dims: [batch_size={tf_layer.shape[0]}, seq_len={tf_layer.shape[1]}, hidden_dim={tf_layer.shape[2]}]") + # HF와 동일: attention_mask로 포지션 ID 생성 + mask = attention_mask_np.astype(np.int32) + pos_ids = np.cumsum(mask, axis=1) * mask + padding_idx - layer_mse = mse(pt_layer.detach().cpu().numpy(), tf_layer_np) - pt_cls_vec = pt_layer[0, 0, :].detach().cpu().numpy() + emb = we[input_ids_np] + pe[pos_ids] + te[0] # type_vocab_size == 1 + mean = emb.mean(axis=-1, keepdims=True) + var = ((emb - mean) ** 2).mean(axis=-1, keepdims=True) # 모집단 분산 + xhat = (emb - mean) / np.sqrt(var + eps) + return xhat * gamma + beta # (B,T,H) - tf_cls_vec = tf_layer_np[0, 0, :] - cls_cos_sim = cosine_similarity(pt_cls_vec[np.newaxis, :], tf_cls_vec[np.newaxis, :])[0] - - print(f" -> MSE: {layer_mse:.6f}") - print(f" -> CLS Token Cosine Similarity: {cls_cos_sim:.6f}") - -# ===================== 추가한 함수: 레이어별 출력 비교 ===================== -def compare_layer_outputs1(pt_all_layer_outputs, tf_all_layer_outputs): - """ - PyTorch vs. TensorFlow 레이어별로 MSE, Cosine Similarity 등을 비교해주는 함수. - - pt_all_layer_outputs: tuple of torch.Tensor (길이: num_layers_PyTorch) - (예: [embedding_output, layer1_output, layer2_output, ...]) - - tf_all_layer_outputs: tf.Tensor (shape: [num_layers_TF, batch_size, seq_len, hidden_dim]) - (예: 0번이 embedding_output, 1번이 1번 레이어, ...) 
- """ - print("\n=== Compare Layer Outputs (PyTorch vs TensorFlow) ===") - - # PyTorch: len(pt_all_layer_outputs) = num_layers_PyTorch - # TensorFlow: tf_all_layer_outputs.shape[0] = num_layers_TF - num_pt_layers = len(pt_all_layer_outputs) - num_tf_layers = tf_all_layer_outputs.shape[0] - - # 두 모델 간 레이어 개수가 다를 수 있으므로, 비교 가능한 만큼만 비교 - min_layers = min(num_pt_layers, num_tf_layers) - - for layer_idx in range(min_layers): - pt_layer = pt_all_layer_outputs[layer_idx] # shape: [batch, seq_len, hidden_dim] - tf_layer = tf_all_layer_outputs[layer_idx] # shape: [batch, seq_len, hidden_dim] - tf_layer_np = tf_layer.numpy() - - # 일단 shape이 같은지 출력 - print(f"Layer {layer_idx}: PT {pt_layer.shape} vs TF {tf_layer.shape}") - - # MSE 계산 - layer_mse = mse(pt_layer.detach().cpu().numpy(), tf_layer_np) - # Cosine Sim: 여기서는 batch*seq_len 개 각 토큰별 벡터의 평균 코사인 유사도 등 - # 또는 첫 배치의 첫 토큰만 비교할 수도 있음 - # 여기서는 간단히 "CLS 토큰(즉 0번 token)에 대한 cos sim" 등 비교 예시 - pt_cls_vec = pt_layer[0, 0, :].detach().cpu().numpy() - tf_cls_vec = tf_layer_np[0, 0, :] - - print(pt_layer) - print(tf_layer_np) - cls_cos_sim = cosine_similarity(pt_cls_vec[np.newaxis, :], tf_cls_vec[np.newaxis, :])[0] - - print(f" -> MSE: {layer_mse:.6f}, CLS CosSim: {cls_cos_sim:.6f}") - print() - def main(): - # 경로 설정 (예: ./bge-m3, ./converted_bge_m3) - model_name_or_path = "BAAI/bge-m3" # PyTorch 원본 - saved_model_dir = "./converted_bge_m3" # TF 변환본 + pt_id = "BAAI/bge-m3" + tf_dir = "./converted_bge_m3_tf1_java_fixed" queries = [ - "이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?이 모델은 무엇을 하는 모델인가요?", + "이 모델은 무엇을 하는 모델인가요? 이 모델은 무엇을 하는 모델인가요?", "이 모델은 무엇을 하는 모델인가요?" ] - print("=== 1) PyTorch 모델 로드 및 인코딩 (레이어별 출력 포함) ===") - pt_model, pt_tokenizer = load_original_pytorch_model(model_name_or_path) - pt_embeddings, pt_all_layer_outputs = encode_with_pytorch_model( - pt_model, - pt_tokenizer, - queries, - max_length=128, - use_cls_pooling=True, - return_hidden_states=True - ) - show_all_layer_outputs_pytorch(pt_all_layer_outputs, print_values=False) - - print("=== 2) TensorFlow 모델 로드 및 인코딩 ===") - tf_serving_fn, tf_tokenizer = load_converted_tf_model(saved_model_dir) - tf_embeddings = encode_with_tf_model( - tf_serving_fn, - tf_tokenizer, - queries, - max_length=128 - ) - - # (옵션) 레이어별 출력 노출 여부 확인 - try: - tf_embeddings_with_layers, tf_all_layer_outputs = encode_with_tf_model_and_get_hidden_states( - tf_serving_fn, - tf_tokenizer, - queries, - max_length=128 - ) - show_all_layer_outputs_tf(tf_all_layer_outputs, print_values=False) - - # [추가] 레이어별로 직접 비교 - compare_layer_outputs(pt_all_layer_outputs, tf_all_layer_outputs) - - print("[TensorFlow] Final Embeddings Shape:", tf_embeddings_with_layers.shape) - except KeyError: - print("TensorFlow 서빙 시그니처에 hidden_states가 없습니다. (기본 TF 변환본일 가능성)") - - print("\n=== 3) PT vs. 
TF 최종 임베딩 비교 ===") - - print(pt_embeddings) - print(tf_embeddings) - - cos_sims = cosine_similarity(pt_embeddings, tf_embeddings) - - errors = (pt_embeddings - tf_embeddings) - mse_val = mse(pt_embeddings, tf_embeddings) - - print("===== Queries =====") - for i, q in enumerate(queries): - print(f"[{i}] {q}") - print() - - print("===== PyTorch Embeddings (shape) =====") - print(pt_embeddings.shape) - print("===== TF Embeddings (shape) =====") - print(tf_embeddings.shape) - - print("\n===== Pairwise Cosine Similarity (PT vs TF) =====") - for i, cs in enumerate(cos_sims): - print(f"Query {i} Cosine Similarity: {cs:.4f}") - - print(f"\n===== MSE (PT vs TF) =====") - print(f"MSE: {mse_val:.6f}") - - print("\n===== Sample Differences (first query, first 5 dims) =====") - print(errors[0][:5]) + print("=== 1) PyTorch ===") + pt_model, pt_tok = load_original_pytorch_model(pt_id) + pt_emb, pt_layers = encode_with_pytorch_model(pt_model, pt_tok, queries, max_length=128) + show_all_layer_outputs_pytorch(pt_layers) + + print("=== 2) TensorFlow ===") + tf_sig, tf_tok = load_converted_tf_model(tf_dir) + tf_emb, tf_layers = encode_with_tf_model(tf_sig, tf_tok, queries, max_length=128) + + pt_l0 = pt_layers[0].detach().cpu().numpy() # (B,T,H) + tf_l0 = tf_layers[0] # (B,T,H) + print("L0 CLS head(PT)[:8]:", pt_l0[0, 0, :8]) + print("L0 CLS head(TF)[:8]:", tf_l0[0, 0, :8]) + + print("\n=== 3) Compare ===") + print("PT shape:", pt_emb.shape, "TF shape:", tf_emb.shape) + cs = cosine_similarity(pt_emb, tf_emb) + print("Cosine:", ["%.4f" % c for c in cs]) + print("MSE:", float(mse(pt_emb, tf_emb))) + + # 선택: 레이어별 비교 (있을 때만) + print(f'tf_layers, {tf_layers}') + if tf_layers is not None: + print("\n[Layer-wise] Cosine (PT vs TF):") + # pt_layers: tuple(len=L+1), tf_layers: (L+1,B,T,H) + tf_layers_np = tf_layers # (L+1,B,T,H) + for i in range(len(pt_layers)): + pt_l = pt_layers[i].detach().cpu().numpy() + tf_l = tf_layers_np[i] + c = cosine_similarity(pt_l[:, 0, :], tf_l[:, 0, :]) # CLS만 비교 + e = mse(pt_l, tf_l) + print(f" Layer {i:02d} cos={c.mean():.4f} mse={e:.6f}") + + from transformers import AutoModel + pt_model = AutoModel.from_pretrained("BAAI/bge-m3") + sd = pt_model.state_dict() + inputs_pt = pt_tok(queries, padding=True, truncation=True, max_length=128, return_tensors="pt") + + l0_manual = manual_l0_from_pt(sd, + inputs_pt["input_ids"].numpy(), + inputs_pt["attention_mask"].numpy(), + padding_idx=1, + eps=float(pt_model.config.layer_norm_eps)) + + pt_l0 = pt_layers[0].detach().cpu().numpy() + tf_l0 = tf_layers[0] + + print("Manual vs PT MSE:", np.mean((l0_manual - pt_l0) ** 2)) + print("Manual vs TF MSE:", np.mean((l0_manual - tf_l0) ** 2)) if __name__ == "__main__": diff --git a/tf1_session_validator.py b/tf1_session_validator.py new file mode 100644 index 0000000..b66c0fc --- /dev/null +++ b/tf1_session_validator.py @@ -0,0 +1,39 @@ +# tf1_session_validator.py +import argparse +import numpy as np +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +from tensorflow.python.saved_model import loader, tag_constants + +SIG = "serving_default" + +def inspect_and_run(model_dir: str, b=2, t=12): + print(f"[inspect] {model_dir}") + g = tf.Graph() + with g.as_default(): + with tf.compat.v1.Session(graph=g) as sess: + meta = loader.load(sess, [tag_constants.SERVING], model_dir) + sig = meta.signature_def[SIG] + + t_ids = g.get_tensor_by_name(sig.inputs["input_ids"].name) + t_msk = g.get_tensor_by_name(sig.inputs["attention_mask"].name) + t_last = 
g.get_tensor_by_name(sig.outputs["last_hidden_state"].name) + t_colb = g.get_tensor_by_name(sig.outputs["colbert_vecs"].name) + + print(" - inputs :", sig.inputs) + print(" - outputs:", sig.outputs) + + ids = np.random.randint(10, 1000, size=(b, t)).astype(np.int32) + msk = np.ones((b, t), dtype=np.int32) + + last, colb = sess.run([t_last, t_colb], feed_dict={t_ids: ids, t_msk: msk}) + print("last_hidden_state:", last.shape, last.dtype) + print("colbert_vecs :", colb.shape, colb.dtype) + print("✔ TF1 Session run OK") + +if __name__ == "__main__": + ap = argparse.ArgumentParser() + ap.add_argument("--dir", type=str, required=False, default="./converted_bge_m3_tf1_v1") + args = ap.parse_args() + inspect_and_run(args.dir) diff --git a/tf1_validator.py b/tf1_validator.py new file mode 100644 index 0000000..95bff8d --- /dev/null +++ b/tf1_validator.py @@ -0,0 +1,165 @@ +# torch_tf_validator.py +import argparse +import os +import numpy as np +import tensorflow as tf +import traceback + +# ========================================================================= +# 1. TF1 환경 설정 및 상수 +# ========================================================================= + +# TF1 환경 보장 (TF2 환경에서 실행 시 Eager Execution 비활성화) +try: + tf.compat.v1.disable_v2_behavior() + tf.compat.v1.disable_eager_execution() + print("INFO: Running in TF1 compatibility mode.") + tf1 = tf.compat.v1 +except Exception as e: + print("INFO: Running in native TF1 mode (or V2 behavior already disabled).") + tf1 = tf + +# Constants for TF1 SavedModel loading +try: + SAVED_MODEL_TAG = tf1.saved_model.tag_constants.SERVING # "serve" +except AttributeError: + # TF1 버전이 매우 낮을 경우 대비 + SAVED_MODEL_TAG = "serve" + +SIGNATURE_KEY = "serving_default" + + +# ========================================================================= +# 2. 검증 실행 함수 (격리된 그래프 사용) +# ========================================================================= + +# ★★★ 함수명이 run_validation으로 변경되었습니다. (이전: run_session_once) ★★★ +def run_validation(model_dir: str, dtype=np.int32, batch=2, seqlen=12): + print(f"[1] SavedModel 로드 및 시그니처 점검: {model_dir}") + + # ★★★★★★★★★★★ 핵심 수정 사항: 격리된 그래프 생성 ★★★★★★★★★★★ + # FailedPreconditionError의 원인인 이름 충돌(예: bge_m3_tensorflow_1)을 방지하기 위해, + # 모델을 기본(Default) 그래프가 아닌, 완전히 격리된 새 그래프로 로드합니다. + graph = tf.Graph() + + with graph.as_default(): + # 이 깨끗한 그래프와 연결된 세션을 생성합니다. + config = tf1.ConfigProto() + + # ★★★ 세션에 명시적으로 그래프 연결 ★★★ + with tf1.Session(graph=graph, config=config) as sess: + + # --- 1단계: 모델 로드 --- + print(f"Loading model into isolated graph...") + try: + # 모델을 깨끗한 그래프(graph)와 세션(sess)으로 로드 (TF1 loader 사용) + # WARNING 메시지는 무시해도 됩니다 (TF2 환경에서 TF1 loader 사용 시 발생) + meta_graph_def = tf1.saved_model.loader.load( + sess, + [SAVED_MODEL_TAG], + model_dir + ) + except Exception as e: + print(f"ERROR: Failed to load SavedModel from {model_dir}. 
Error: {e}") + return + + # --- 2단계: 시그니처 점검 --- + if SIGNATURE_KEY not in meta_graph_def.signature_def: + print(f"ERROR: Signature '{SIGNATURE_KEY}' not found.") + return + + signature_def = meta_graph_def.signature_def[SIGNATURE_KEY] + print(f" - 사용 시그니처: {SIGNATURE_KEY}") + + # Shape 출력을 위한 헬퍼 함수 (TF1 방식) + def format_shape(tensor_info): + try: + # TensorShapeProto에서 shape 추출 + return [d.size for d in tensor_info.tensor_shape.dim] + except: + return "Unknown" + + print(" - 입력들:") + for key, tensor_info in signature_def.inputs.items(): + print( + f" • key='{key}', dtype={tf.dtypes.as_dtype(tensor_info.dtype).name}, shape={format_shape(tensor_info)}, name='{tensor_info.name}'") + + print(" - 출력들:") + for key, tensor_info in signature_def.outputs.items(): + print( + f" • key='{key}', dtype={tf.dtypes.as_dtype(tensor_info.dtype).name}, shape={format_shape(tensor_info)}, name='{tensor_info.name}'") + + # --- 3단계: 추론 테스트 실행 --- + print(f"\n[2] TF1 세션으로 1회 추론 실행 (입력 dtype={dtype.__name__}, B={batch}, T={seqlen})") + + # 더미 입력 데이터 준비 (int32 요구) + input_ids_data = np.random.randint(100, 10000, size=(batch, seqlen)).astype(dtype) + attention_mask_data = np.ones((batch, seqlen)).astype(dtype) + + # 시그니처에서 입출력 텐서 이름 식별 + try: + input_ids_tname = signature_def.inputs['input_ids'].name + attention_mask_tname = signature_def.inputs['attention_mask'].name + last_h_tname = signature_def.outputs['last_hidden_state'].name + colbert_tname = signature_def.outputs['colbert_vecs'].name + except KeyError as e: + print(f"ERROR: Expected tensor key not found in signature: {e}") + return + + feed_dict = { + input_ids_tname: input_ids_data, + attention_mask_tname: attention_mask_data, + } + + fetches = [last_h_tname, colbert_tname] + + # 추론 실행 + try: + print("Running session...") + # ★★★ 그래프가 격리되었으므로 성공해야 합니다. ★★★ + last_h, colbert = sess.run(fetches, feed_dict=feed_dict) + + print("\n[SUCCESS] Inference successful!") + print(f" - last_hidden_state shape: {last_h.shape}, dtype: {last_h.dtype}") + print(f" - colbert_vecs shape: {colbert.shape}, dtype: {colbert.dtype}") + + except Exception as e: + print(f"\n[FAILURE] ERROR during inference: {type(e).__name__}") + if "FailedPreconditionError" in str(type(e)): + print("FailedPreconditionError가 여전히 발생했습니다.") + print("이는 모델 변환 과정(BGEM3WeightConverter.py)에서 이미 이름 불일치가 발생하여 저장되었음을 의미합니다.") + print("해결 방법: 모델 폴더(converted_bge_m3_tf1safe)를 삭제하고, 완전히 새로운 터미널에서 변환 스크립트를 다시 실행 후 검증하세요.") + traceback.print_exc() + + +# ========================================================================= +# 3. 실행 로직 +# ========================================================================= + +def main(): + default_model_dir = "./converted_bge_m3_tf1safe" + + parser = argparse.ArgumentParser(description="Validate converted BGE-M3 TensorFlow SavedModel in TF1 environment.") + parser.add_argument("--model_dir", type=str, default=default_model_dir, + help="Path to the SavedModel directory (e.g., converted_bge_m3_tf1safe)") + args = parser.parse_args() + + # 경로 확인 로직 (model 하위 폴더 자동 탐색) + model_path = args.model_dir + + # 1. 지정된 경로 확인 + if os.path.exists(os.path.join(model_path, "saved_model.pb")): + pass # 경로 정상 + # 2. 
하위 'model' 폴더 확인 + elif os.path.exists(os.path.join(model_path, "model", "saved_model.pb")): + model_path = os.path.join(model_path, "model") + else: + print(f"Error: saved_model.pb not found in {args.model_dir} or {os.path.join(args.model_dir, 'model')}.") + return + + # ★★★ 수정된 함수 호출 ★★★ + run_validation(model_path, dtype=np.int32, batch=2, seqlen=12) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tf_colbert_loader.py b/tf_colbert_loader.py index 8005891..7625cb6 100644 --- a/tf_colbert_loader.py +++ b/tf_colbert_loader.py @@ -43,7 +43,7 @@ def call_signature(sig, input_ids, attention_mask, token_type_ids=None): def main(): parser = argparse.ArgumentParser(description="Load TF ColBERT SavedModel and print output shapes.") - parser.add_argument("--model_dir", default="./converted_bge_m3", help="Path to SavedModel root (contains tokenizer files).") + parser.add_argument("--model_dir", default="./converted_bge_m3_tf1_java", help="Path to SavedModel root (contains tokenizer files).") parser.add_argument("--max_length", type=int, default=128, help="Tokenization max length.") parser.add_argument("--texts", nargs="*", default=[ "이 모델은 무엇을 하는 모델인가요?", diff --git a/torch_tf_validator.py b/torch_tf_validator.py index b745ab4..1e39666 100644 --- a/torch_tf_validator.py +++ b/torch_tf_validator.py @@ -1,127 +1,81 @@ -import loguru - -from transformers import AutoTokenizer, AutoModel -import tensorflow as tf -import torch - -def load_torch_model(model_path): - model = AutoModel.from_pretrained(model_path) - return model - - -def load_tf_model(model_path): - with tf.device("/CPU:0"): - model = tf.saved_model.load(model_path) - return model - - -def load_tokenizer(model_path): - tokenizer = AutoTokenizer.from_pretrained(model_path) - return tokenizer - - -def tokenize_wo_padding(tokenizer, text, return_tensors="pt"): - return tokenizer(text, padding=False, return_tensors=return_tensors) - - -def tokenize_w_padding(tokenizer, text, return_tensors="pt", max_length=512): - return tokenizer(text, padding="max_length", max_length=max_length, return_tensors=return_tensors) - - -def call_signature(sig, input_ids, attention_mask, token_type_ids=None): - """Adapt arguments to a SavedModel signature: keys and dtypes.""" - spec_kwargs = sig.structured_input_signature[1] - - def prepare(name, value): - if name not in spec_kwargs: - return None - if value is None and name == "token_type_ids": - value = tf.zeros_like(input_ids) - want = spec_kwargs[name].dtype - if hasattr(value, "dtype") and value.dtype != want: - value = tf.cast(value, want) - return value - - kwargs = {} - x = prepare("input_ids", input_ids) - if x is not None: - kwargs["input_ids"] = x - x = prepare("attention_mask", attention_mask) - if x is not None: - kwargs["attention_mask"] = x - x = prepare("token_type_ids", token_type_ids) - if x is not None: - kwargs["token_type_ids"] = x - return sig(**kwargs) - +# torch_tf_validator.py +import argparse +import numpy as np +import tensorflow as tf + +tf.compat.v1.disable_eager_execution() + +TAG_SERVE = tf.saved_model.SERVING +SIG_NAME = "serving_default" + +def _tensor_shape_to_list(tensor_shape_proto): + dims = tensor_shape_proto.dim + if not dims: + return None + return [d.size for d in dims] + +def inspect_signature(model_dir: str): + print(f"\n[1] SavedModel 로드 및 시그니처 점검: {model_dir}") + g = tf.Graph() + with g.as_default(): + with tf.compat.v1.Session(graph=g) as sess: + meta_graph_def = tf.compat.v1.saved_model.load(sess, [TAG_SERVE], model_dir) + sigs = 
meta_graph_def.signature_def + if SIG_NAME not in sigs: + raise RuntimeError(f"Signature '{SIG_NAME}' not found. Available: {list(sigs.keys())}") + sig = sigs[SIG_NAME] + print(f" - 사용 시그니처: {SIG_NAME}") + print(" - 입력들:") + for k, tinfo in sig.inputs.items(): + print(f" • key='{k}', dtype={tf.DType(tinfo.dtype).name}, " + f"shape={_tensor_shape_to_list(tinfo.tensor_shape)}, name='{tinfo.name}'") + print(" - 출력들:") + for k, tinfo in sig.outputs.items(): + print(f" • key='{k}', dtype={tf.DType(tinfo.dtype).name}, " + f"shape={_tensor_shape_to_list(tinfo.tensor_shape)}, name='{tinfo.name}'") + +def run_session_once(model_dir: str, dtype=np.int32, batch=2, seqlen=12): + print(f"\n[2] TF1 세션으로 1회 추론 실행 (입력 dtype={np.dtype(dtype).name}, B={batch}, T={seqlen})") + g = tf.Graph() + with g.as_default(): + with tf.compat.v1.Session(graph=g) as sess: + meta_graph_def = tf.compat.v1.saved_model.load(sess, [TAG_SERVE], model_dir) + sig = meta_graph_def.signature_def[SIG_NAME] + + t_input_ids = g.get_tensor_by_name(sig.inputs["input_ids"].name) + t_attention = g.get_tensor_by_name(sig.inputs["attention_mask"].name) + t_last = g.get_tensor_by_name(sig.outputs["last_hidden_state"].name) + t_colbert = g.get_tensor_by_name(sig.outputs["colbert_vecs"].name) + + input_ids = np.random.randint(10, 1000, size=(batch, seqlen)).astype(dtype) + attention_mask = np.ones((batch, seqlen), dtype=dtype) + + last_h, colbert = sess.run( + [t_last, t_colbert], + feed_dict={t_input_ids: input_ids, t_attention: attention_mask} + ) + print(" - last_hidden_state:", last_h.shape, last_h.dtype) + print(" - colbert_vecs :", colbert.shape, colbert.dtype) + assert last_h.dtype == np.float32 and colbert.dtype == np.float32 + assert last_h.shape == (batch, seqlen, 1024) + print(colbert.shape) + #assert colbert.shape == (batch, seqlen, 1024) + print(" ✔ 세션 추론 성공") def main(): - # Load the model - model_path = "BAAI/bge-m3" - model_path_tf = "/workspace/BGE-M3-Model-Converter/model" - model = load_torch_model(model_path) - tokenizer = load_tokenizer(model_path) - - # Tokenize the text - text = "Hello, my dog is cute" - inputs = tokenize_wo_padding(tokenizer, text) - inputs_w_padding = tokenize_w_padding(tokenizer, text) - - # Get the output from the model - loguru.logger.info("Torch] Model output".ljust(50, "-")) - model.eval().to("cuda") - with torch.no_grad(): - inputs = {k: v.to("cuda") for k, v in inputs.items()} - inputs_w_padding = {k: v.to("cuda") for k, v in inputs_w_padding.items()} - - output = model(**inputs) - output_w_padding = model(**inputs_w_padding) - loguru.logger.info("output without padding (GT)".ljust(50, "-")) - loguru.logger.info(output['last_hidden_state'][:, 0]) - loguru.logger.info("="*50) - loguru.logger.info("output with padding".ljust(50, "-")) - loguru.logger.info(output_w_padding['last_hidden_state'][:, 0]) - loguru.logger.info("="*50) - err = torch.abs(output['last_hidden_state'][:, 0] - output_w_padding['last_hidden_state'][:, 0]) - loguru.logger.info("Error".ljust(50, "-")) - loguru.logger.info(err.mean()) - - inputs_tf = tokenize_wo_padding(tokenizer, text, return_tensors="tf") - inputs_tf_w_padding = tokenize_w_padding(tokenizer, text, return_tensors="tf") - loaded = load_tf_model(model_path_tf) - # Use the default 2-input signature - sigs = loaded.signatures - tf_model = sigs.get("serving_default") - if tf_model is None: - raise RuntimeError("serving_default signature not found") - - loguru.logger.info("Tensorflow] Model output".ljust(50, "-")) - with tf.device("/GPU:0"): - output_tf = 
call_signature( - tf_model, inputs_tf["input_ids"], inputs_tf["attention_mask"], None - ) - output_tf_w_padding = call_signature( - tf_model, - inputs_tf_w_padding["input_ids"], - inputs_tf_w_padding["attention_mask"], - None, - ) - loguru.logger.info("output without padding (GT)".ljust(50, "-")) - hs = output_tf['last_hidden_state'] - val_no_pad = hs[:, 0] - loguru.logger.info(val_no_pad) - loguru.logger.info("="*50) - loguru.logger.info("output with padding".ljust(50, "-")) - hsw = output_tf_w_padding['last_hidden_state'] - val_pad = hsw[:, 0] - loguru.logger.info(val_pad) - loguru.logger.info("="*50) - err_tf = tf.abs(val_no_pad - val_pad) - loguru.logger.info("Error".ljust(50, "-")) - loguru.logger.info(tf.reduce_mean(err_tf)) - loguru.logger.info("="*50) - - + ap = argparse.ArgumentParser() + ap.add_argument("--model_dir", type=str, default="converted_bge_m3_tf1_java_fixed/model", help="Path to SavedModel directory (…/model)") + args = ap.parse_args() + + inspect_signature(args.model_dir) + run_session_once(args.model_dir, dtype=np.int32, batch=2, seqlen=12) + + print("\n[3] (의도적) int64로 재실행 → 보통 dtype mismatch로 실패하거나 내부 캐스팅 없으면 에러") + try: + run_session_once(args.model_dir, dtype=np.int64, batch=2, seqlen=12) + print("※ int64 입력이 통과하면, 시그니처가 int64이거나 내부 캐스팅이 있는 경우입니다.") + except Exception as e: + print(" ✔ 기대된 실패(입력 dtype 불일치):", type(e).__name__, str(e)[:200], "…") if __name__ == "__main__": main() From c8588163bd98955972cbc871cc0551173fe51651 Mon Sep 17 00:00:00 2001 From: sigridjineth Date: Sun, 7 Sep 2025 18:41:58 +0900 Subject: [PATCH 3/3] validate colbert --- export_tf1_saved_model.py | 90 ++++++++++++++++------- model_conversion_validator.py | 133 +++++++++++++++++++++++++++++++++- 2 files changed, 196 insertions(+), 27 deletions(-) diff --git a/export_tf1_saved_model.py b/export_tf1_saved_model.py index 38cebb7..5f06da1 100644 --- a/export_tf1_saved_model.py +++ b/export_tf1_saved_model.py @@ -4,7 +4,8 @@ import torch import tensorflow as tf from transformers import AutoTokenizer -# from BGEM3TFModel_tfkeras2 import BGEM3TensorFlow +from BGEM3TFModel_tfkeras2 import BGEM3TensorFlow +from huggingface_hub import snapshot_download # NEW os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "1") @@ -28,31 +29,70 @@ def load_state_dict(model_or_path: str) -> dict: print(f"[load] pytorch_model.bin: {pt}") return torch.load(pt, map_location="cpu") -def load_colbert_weight(model_or_path: str): - try: - if os.path.isdir(model_or_path): - p = os.path.join(model_or_path, "colbert_linear.pt") - else: - from huggingface_hub import snapshot_download - local = snapshot_download(repo_id=model_or_path) - p = os.path.join(local, "colbert_linear.pt") - st = torch.load(p, map_location="cpu") - if isinstance(st, dict): - W = st.get("weight") - B = st.get("bias") - if W is None: - first_key = next(iter(st)) - W = st[first_key] - B = st.get("bias", None) +def load_colbert_weight(model_name_or_path: str): + """ + Hugging Face repo(또는 로컬 폴더)에서 colbert_linear.pt를 읽어 weight/bias를 numpy(float32)로 반환 + weight: (out_dim, in_dim), bias: (out_dim,) + """ + if os.path.isdir(model_name_or_path): + p = os.path.join(model_name_or_path, "colbert_linear.pt") + else: + local = snapshot_download(repo_id=model_name_or_path) + p = os.path.join(local, "colbert_linear.pt") + + st = torch.load(p, map_location="cpu") + if isinstance(st, dict): + W = st.get("weight") + B = st.get("bias") + if W is None: # 혹시 키 이름이 다르면 첫 텐서를 weight로 간주 + first_key = next(iter(st)) + W = st[first_key] + B = st.get("bias", None) + else: + # 리스트/튜플 
또는 텐서 + if isinstance(st, (list, tuple)): + W, B = st else: - W, B = (st, None) if not isinstance(st, (list, tuple)) else st - W = _np(W) - B = _np(B) if B is not None else None - print(f"[colbert] head: out_dim={W.shape[0]}, in_dim={W.shape[1]}, bias={'yes' if B is not None else 'no'}") - return W, B - except Exception as e: - print(f"[colbert] not found ({e}); skipping)") - return None, None + W, B = st, None + + W = W.detach().cpu().numpy().astype(np.float32) + B = ( + B.detach().cpu().numpy().astype(np.float32) + if B is not None + else np.zeros((W.shape[0],), np.float32) + ) + print(f"[colbert] head: out_dim={W.shape[0]}, in_dim={W.shape[1]}, bias={'yes' if B is not None else 'no'}") + return W, B + + +def project_colbert_pt( + last_hidden_np: np.ndarray, + attention_mask_np: np.ndarray, + W: np.ndarray, + b: np.ndarray, +) -> np.ndarray: + """ + PT last_hidden_state로부터 ColBERT head 적용 결과를 계산 + last_hidden_np: (B,T,H) / attention_mask_np: (B,T) / W: (O,H) / b: (O,) + 반환: masked colbert vecs, shape (B,T-1,O) + """ + # 1) CLS 제외 + x = last_hidden_np[:, 1:, :] # (B, T-1, H) + # 2) 선형 사상: x @ W^T + b, einsum 'bth,oh->bto' + y = np.einsum('bth,oh->bto', x, W) + b[None, None, :] + # 3) 마스킹 + submask = attention_mask_np[:, 1:].astype(np.float32) # (B, T-1) + y = y * submask[:, :, None] + return y + + +def cosine_rowwise(a: np.ndarray, b: np.ndarray, eps: float = 1e-9) -> np.ndarray: + """ + 마지막 축(특징 축) 기준 코사인 유사도. a,b: (...,D) -> (...) + """ + a_n = a / (np.linalg.norm(a, axis=-1, keepdims=True) + eps) + b_n = b / (np.linalg.norm(b, axis=-1, keepdims=True) + eps) + return np.sum(a_n * b_n, axis=-1) def _to_tf_dtype(d) -> tf.dtypes.DType: try: diff --git a/model_conversion_validator.py b/model_conversion_validator.py index a054ed1..f07ff88 100644 --- a/model_conversion_validator.py +++ b/model_conversion_validator.py @@ -3,6 +3,7 @@ import torch import tensorflow as tf from transformers import AutoTokenizer, AutoModel +from huggingface_hub import snapshot_download def load_original_pytorch_model(model_name_or_path): @@ -72,7 +73,8 @@ def encode_with_tf_model(serving_fn, tokenizer, queries, max_length=128): emb = last_hidden[:, 0, :].numpy() hiddens = outputs.get("hidden_states", None) # (L+1,B,T,H) print(f'hiddens, {hiddens}') - return emb, (hiddens.numpy() if hiddens is not None else None) + colbert = outputs.get("colbert_vecs", None) + return emb, (hiddens.numpy() if hiddens is not None else None), (colbert.numpy() if colbert is not None else None) def cosine_similarity(a, b): @@ -103,6 +105,64 @@ def manual_l0_from_pt(sd, input_ids_np, attention_mask_np, padding_idx=1, eps=1e return xhat * gamma + beta # (B,T,H) +# === ColBERT helpers === +def load_colbert_weight(model_name_or_path: str): + """ + Load colbert_linear.pt from local path or HF repo and return W, b as float32 numpy arrays. 
+ W: (out_dim, in_dim), b: (out_dim,) + """ + import os + if os.path.isdir(model_name_or_path): + p = os.path.join(model_name_or_path, "colbert_linear.pt") + else: + local = snapshot_download(repo_id=model_name_or_path) + p = os.path.join(local, "colbert_linear.pt") + + st = torch.load(p, map_location="cpu") + if isinstance(st, dict): + W = st.get("weight") + B = st.get("bias") + if W is None: + first_key = next(iter(st)) + W = st[first_key] + B = st.get("bias", None) + else: + if isinstance(st, (list, tuple)): + W, B = st + else: + W, B = st, None + + W = W.detach().cpu().numpy().astype(np.float32) + B = ( + B.detach().cpu().numpy().astype(np.float32) + if B is not None + else np.zeros((W.shape[0],), np.float32) + ) + return W, B + + +def project_colbert_pt(last_hidden_np: np.ndarray, + attention_mask_np: np.ndarray, + W: np.ndarray, + b: np.ndarray) -> np.ndarray: + """ + Apply ColBERT head on PT last_hidden_state and mask out padding. + last_hidden_np: (B,T,H), attention_mask_np: (B,T), W: (O,H), b: (O,) -> returns (B,T-1,O) + """ + x = last_hidden_np[:, 1:, :] # remove CLS + y = np.einsum('bth,oh->bto', x, W) + b[None, None, :] + submask = attention_mask_np[:, 1:].astype(np.float32) + y = y * submask[:, :, None] + return y + + +def cosine_rowwise(a: np.ndarray, b: np.ndarray, eps: float = 1e-9) -> np.ndarray: + """Cosine similarity along the last dimension.""" + a_n = a / (np.linalg.norm(a, axis=-1, keepdims=True) + eps) + b_n = b / (np.linalg.norm(b, axis=-1, keepdims=True) + eps) + return np.sum(a_n * b_n, axis=-1) + + def main(): @@ -121,7 +181,7 @@ def main(): print("=== 2) TensorFlow ===") tf_sig, tf_tok = load_converted_tf_model(tf_dir) - tf_emb, tf_layers = encode_with_tf_model(tf_sig, tf_tok, queries, max_length=128) + tf_emb, tf_layers, tf_colbert = encode_with_tf_model(tf_sig, tf_tok, queries, max_length=128) pt_l0 = pt_layers[0].detach().cpu().numpy() # (B,T,H) tf_l0 = tf_layers[0] # (B,T,H) @@ -164,6 +224,75 @@ def main(): print("Manual vs PT MSE:", np.mean((l0_manual - pt_l0) ** 2)) print("Manual vs TF MSE:", np.mean((l0_manual - tf_l0) ** 2)) + # === 4) ColBERT head validation (masked) === + try: + Wc, bc = load_colbert_weight(pt_id) + batch_pt = pt_tok(queries, padding=True, truncation=True, max_length=128, return_tensors="pt") + inputs_mask = batch_pt["attention_mask"].numpy().astype(np.int32) # (B,T) + + # PT 측 colbert 투영 + pt_last_hidden_np = pt_layers[-1].detach().cpu().numpy() # (B,T,H) + pt_colbert = project_colbert_pt(pt_last_hidden_np, inputs_mask, Wc, bc) # (B,T-1,O) + + if tf_colbert is None: + print("[ColBERT] TF colbert_vecs not present in signature; skipped") + else: + # 시간축 동기화 (이론상 T-1 동일) + min_T = min(pt_colbert.shape[1], tf_colbert.shape[1]) + ptc = pt_colbert[:, :min_T, :] + tfc = tf_colbert[:, :min_T, :] + + # 유효토큰 마스크 (CLS 제외) + valid_mask = (inputs_mask[:, 1:][:, :min_T] == 1) # (B, min_T) + # 평탄화 후 유효토큰만 선택 + pt_flat = ptc.reshape(-1, ptc.shape[-1])[valid_mask.reshape(-1)] + tf_flat = tfc.reshape(-1, tfc.shape[-1])[valid_mask.reshape(-1)] + + # 영벡터 제거(정규화시 왜곡 방지) + keep = (np.linalg.norm(pt_flat, axis=1) > 1e-12) | (np.linalg.norm(tf_flat, axis=1) > 1e-12) + pt_flat = pt_flat[keep] + tf_flat = tf_flat[keep] + + col_mse_valid = mse(pt_flat, tf_flat) + col_cos_valid = cosine_rowwise(pt_flat, tf_flat).mean() + print(f"\n[ColBERT(valid)] mse={col_mse_valid:.8f} cos={col_cos_valid:.6f}") + + # 참고: 모든 위치(패딩 포함) 지표도 함께 출력 + col_mse_all = mse(ptc, tfc) + col_cos_all = cosine_rowwise( + ptc.reshape(-1, ptc.shape[-1]), tfc.reshape(-1, tfc.shape[-1]) + 
).mean() + print(f"[ColBERT(all-pos)] mse={col_mse_all:.8f} cos={col_cos_all:.6f}") + + # === 5) last_hidden_state (masked) 비교 === + pt_last = pt_layers[-1].detach().cpu().numpy() # (B,T,H) + tf_last = tf_layers[-1] # (B,T,H) + min_T2 = min(pt_last.shape[1], tf_last.shape[1]) + pt_last = pt_last[:, :min_T2, :] + tf_last = tf_last[:, :min_T2, :] + valid_mask2 = (inputs_mask[:, :min_T2] == 1) + pt_flat2 = pt_last.reshape(-1, pt_last.shape[-1])[valid_mask2.reshape(-1)] + tf_flat2 = tf_last.reshape(-1, tf_last.shape[-1])[valid_mask2.reshape(-1)] + mse_last = mse(pt_flat2, tf_flat2) + cos_last = cosine_rowwise(pt_flat2, tf_flat2).mean() + print(f"[last_hidden(valid)] mse={mse_last:.8f} cos={cos_last:.6f}") + + # === 6) Determinism check === + batch_tf = tf_tok(queries, padding=True, truncation=True, max_length=128, return_tensors="tf") + outs1 = tf_sig( + input_ids=tf.cast(batch_tf["input_ids"], tf.int32), + attention_mask=tf.cast(batch_tf["attention_mask"], tf.int32), + ) + outs2 = tf_sig( + input_ids=tf.cast(batch_tf["input_ids"], tf.int32), + attention_mask=tf.cast(batch_tf["attention_mask"], tf.int32), + ) + lh1 = outs1["last_hidden_state"].numpy() + lh2 = outs2["last_hidden_state"].numpy() + print("[determinism] max_abs_diff:", float(np.max(np.abs(lh1 - lh2)))) + except Exception as e: + print(f"[ColBERT] skipped: {e}") + if __name__ == "__main__": main()
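
Weight-layout note. The converter maps every PyTorch nn.Linear weight of shape (out_dim, in_dim) onto a Keras Dense kernel of shape (in_dim, out_dim), which is why assign_via_feed receives _np(sd[...]).T throughout the encoder loop and for the ColBERT head. A minimal standalone sketch of that convention, using illustrative sizes (in_dim=8, out_dim=4) and random data rather than anything from the checkpoint; run it in ordinary eager/TF2 mode:

    # Sketch only: confirms the (out, in) -> (in, out) transpose used when
    # assigning PyTorch Linear weights to Keras Dense kernels.
    import numpy as np
    import torch
    import tensorflow as tf

    rng = np.random.default_rng(0)
    x = rng.standard_normal((2, 8)).astype(np.float32)   # batch of 2, in_dim=8 (illustrative)

    lin = torch.nn.Linear(8, 4)                           # PyTorch weight shape: (out=4, in=8)
    w = lin.weight.detach().numpy()
    b = lin.bias.detach().numpy()

    dense = tf.keras.layers.Dense(4)
    dense.build((None, 8))                                # Keras kernel shape: (in=8, out=4)
    dense.set_weights([w.T, b])

    y_pt = lin(torch.from_numpy(x)).detach().numpy()
    y_tf = dense(tf.constant(x)).numpy()
    print(np.max(np.abs(y_pt - y_tf)))                    # ~1e-7, identical up to float32 noise

The same rule covers colbert_linear: a head weight W of shape (O, H) becomes a kernel of shape (H, O), with the bias copied as-is.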
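
Position-id note. _pt_style_l0 and manual_l0_from_pt both rebuild position ids from the attention mask: positions count only real tokens, start at padding_idx + 1, and padded slots fall back to padding_idx. A small numpy illustration with a made-up mask (this mirrors the HF XLM-RoBERTa position-id computation whenever the attention mask and the non-padding mask coincide):

    # Sketch only: the cumsum-based position-id rule assumed by the L0 checks.
    import numpy as np

    attention_mask = np.array([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]], dtype=np.int32)
    padding_idx = 1

    position_ids = np.cumsum(attention_mask, axis=1) * attention_mask + padding_idx
    print(position_ids)
    # [[2 3 4 1 1]
    #  [2 3 4 5 6]]

Padded slots therefore index the padding row of position_embeddings, so they contribute the same (padding) embedding the PyTorch model uses before LayerNorm.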
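
LayerNorm note. The L0 sanity check uses a hand-written LayerNorm: mean and population variance over the hidden axis, with eps added inside the square root. A short check on random data that this formulation agrees with torch.nn.functional.layer_norm, which is what the original checkpoint was trained against; shapes here are illustrative:

    # Sketch only: manual LayerNorm (as in _pt_style_l0) vs. torch layer_norm.
    import numpy as np
    import torch

    rng = np.random.default_rng(0)
    x = rng.standard_normal((2, 5, 16)).astype(np.float32)
    gamma = rng.standard_normal(16).astype(np.float32)
    beta = rng.standard_normal(16).astype(np.float32)
    eps = 1e-5

    mean = x.mean(axis=-1, keepdims=True)
    var = ((x - mean) ** 2).mean(axis=-1, keepdims=True)   # population variance
    manual = (x - mean) / np.sqrt(var + eps) * gamma + beta

    ref = torch.nn.functional.layer_norm(
        torch.from_numpy(x), (16,), torch.from_numpy(gamma), torch.from_numpy(beta), eps
    ).numpy()
    print(np.max(np.abs(manual - ref)))                    # ~1e-6 or smaller

Keras LayerNormalization(epsilon=1e-5) evaluates the same expression, so copying gamma/beta and keeping epsilon at 1e-5 should match the PyTorch side up to float32 precision.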