diff --git a/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py b/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py
index 5adc1aac1a..988db0dc7b 100644
--- a/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py
+++ b/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py
@@ -96,7 +96,7 @@
         decoder=dict(codec, heatmap_size=codec['input_size'])),
     test_cfg=dict(
         multiscale_test=False,
-        flip_test=True,
+        flip_test=False,
         shift_heatmap=True,
         restore_heatmap_size=True,
         align_corners=False))
@@ -113,9 +113,14 @@
     dict(
         type='BottomupResize',
         input_size=codec['input_size'],
-        size_factor=32,
+        size_factor=64,
         resize_mode='expand'),
-    dict(type='PackPoseInputs')
+    dict(
+        type='PackPoseInputs',
+        meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+                   'img_shape', 'input_size', 'input_center', 'input_scale',
+                   'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+                   'skeleton_links'))
 ]
 
 # data loaders
diff --git a/mmpose/codecs/associative_embedding.py b/mmpose/codecs/associative_embedding.py
index 7e080f1657..9c9a1f0a6e 100644
--- a/mmpose/codecs/associative_embedding.py
+++ b/mmpose/codecs/associative_embedding.py
@@ -1,10 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from collections import namedtuple
+# from copy import deepcopy
 from itertools import product
 from typing import Any, List, Optional, Tuple
 
 import numpy as np
 import torch
+# from mmengine import dump
 from munkres import Munkres
 from torch import Tensor
 
@@ -75,7 +77,9 @@ def _init_group():
             tag_list=[])
         return _group
 
-    for i in keypoint_order:
+    # group_history = []
+
+    for idx, i in enumerate(keypoint_order):
         # Get all valid candidate of the i-th keypoints
         valid = vals[i] > val_thr
         if not valid.any():
@@ -87,12 +91,22 @@ def _init_group():
 
         if len(groups) == 0:  # Initialize the group pool
             for tag, val, loc in zip(tags_i, vals_i, locs_i):
+
+                # Check if the keypoint belongs to existing groups
+                if len(groups):
+                    prev_tags = np.stack([g.tag_list[0] for g in groups])
+                    dists = np.linalg.norm(prev_tags - tag, ord=2, axis=1)
+                    if dists.min() < 1:
+                        continue
+
                 group = _init_group()
                 group.kpts[i] = loc
                 group.scores[i] = val
                 group.tag_list.append(tag)
 
                 groups.append(group)
+            # costs_copy = None
+            matches = None
 
         else:  # Match keypoints to existing groups
             groups = groups[:max_groups]
@@ -101,17 +115,18 @@ def _init_group():
             # Calculate distance matrix between group tags and tag candidates
             # of the i-th keypoint
             # Shape: (M', 1, L) , (1, G, L) -> (M', G, L)
-            diff = tags_i[:, None] - np.array(group_tags)[None]
+            diff = (tags_i[:, None] -
+                    np.array(group_tags)[None]).astype(np.float64)
             dists = np.linalg.norm(diff, ord=2, axis=2)
             num_kpts, num_groups = dists.shape[:2]
 
-            # Experimental cost function for keypoint-group matching
+            # Experimental cost function for keypoint-group matching
             costs = np.round(dists) * 100 - vals_i[..., None]
+
             if num_kpts > num_groups:
-                padding = np.full((num_kpts, num_kpts - num_groups),
-                                  1e10,
-                                  dtype=np.float32)
+                padding = np.full((num_kpts, num_kpts - num_groups), 1e10)
                 costs = np.concatenate((costs, padding), axis=1)
+            # costs_copy = costs.copy()
 
             # Match keypoints and groups by Munkres algorithm
             matches = munkres.compute(costs)
@@ -121,13 +136,30 @@ def _init_group():
                     # Add the keypoint to the matched group
                     group = groups[group_idx]
                 else:
-                    # Initialize a new group with unmatched keypoint
-                    group = _init_group()
-                    groups.append(group)
-
-                group.kpts[i] = locs_i[kpt_idx]
-                group.scores[i] = vals_i[kpt_idx]
-                group.tag_list.append(tags_i[kpt_idx])
+                    # if dists[kpt_idx].min() < 0.2:
+                    if False:
+                        group = None
+                    else:
+                        # Initialize a new group with unmatched keypoint
+                        group = _init_group()
+                        groups.append(group)
+                if group is not None:
+                    group.kpts[i] = locs_i[kpt_idx]
+                    group.scores[i] = vals_i[kpt_idx]
+                    group.tag_list.append(tags_i[kpt_idx])
+
+        # out = {
+        #     'idx': idx,
+        #     'i': i,
+        #     'costs': costs_copy,
+        #     'matches': matches,
+        #     'kpts': np.array([g.kpts for g in groups]),
+        #     'scores': np.array([g.scores for g in groups]),
+        #     'tag_list': [np.array(g.tag_list) for g in groups],
+        # }
+        # group_history.append(deepcopy(out))
+
+    # dump(group_history, 'group_history.pkl')
 
     groups = groups[:max_groups]
     if groups:
@@ -210,7 +242,7 @@ def __init__(
         decode_gaussian_kernel: int = 3,
         decode_keypoint_thr: float = 0.1,
         decode_tag_thr: float = 1.0,
-        decode_topk: int = 20,
+        decode_topk: int = 30,
         decode_max_instances: Optional[int] = None,
     ) -> None:
         super().__init__()
@@ -336,6 +368,12 @@ def _get_batch_topk(self, batch_heatmaps: Tensor, batch_tags: Tensor,
         B, K, H, W = batch_heatmaps.shape
         L = batch_tags.shape[1] // K
 
+        # Heatmap NMS
+        # dump(batch_heatmaps.cpu().numpy(), 'heatmaps.pkl')
+        batch_heatmaps = batch_heatmap_nms(batch_heatmaps,
+                                           self.decode_nms_kernel)
+        # dump(batch_heatmaps.cpu().numpy(), 'heatmaps_nms.pkl')
+
         # shape of topk_val, top_indices: (B, K, TopK)
         topk_vals, topk_indices = batch_heatmaps.flatten(-2, -1).topk(
             k, dim=-1)
@@ -433,9 +471,8 @@ def _fill_missing_keypoints(self, keypoints: np.ndarray,
                 cost_map = np.round(dist_map) * 100 - heatmaps[k]  # H, W
                 y, x = np.unravel_index(np.argmin(cost_map), shape=(H, W))
                 keypoints[n, k] = [x, y]
-                keypoint_scores[n, k] = heatmaps[k, y, x]
 
-        return keypoints, keypoint_scores
+        return keypoints
 
     def batch_decode(self, batch_heatmaps: Tensor, batch_tags: Tensor
                      ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
@@ -457,15 +494,12 @@ def batch_decode(self, batch_heatmaps: Tensor, batch_tags: Tensor
                 batch, each is in shape (N, K). It usually represents the
                 confidience of the keypoint prediction
         """
+
         B, _, H, W = batch_heatmaps.shape
         assert batch_tags.shape[0] == B and batch_tags.shape[2:4] == (H, W), (
             f'Mismatched shapes of heatmap ({batch_heatmaps.shape}) and '
             f'tagging map ({batch_tags.shape})')
 
-        # Heatmap NMS
-        batch_heatmaps = batch_heatmap_nms(batch_heatmaps,
-                                           self.decode_nms_kernel)
-
         # Get top-k in each heatmap and and convert to numpy
         batch_topk_vals, batch_topk_tags, batch_topk_locs = to_numpy(
             self._get_batch_topk(
@@ -489,7 +523,7 @@ def batch_decode(self, batch_heatmaps: Tensor, batch_tags: Tensor
 
             if keypoints.size > 0:
                 # identify missing keypoints
-                keypoints, scores = self._fill_missing_keypoints(
+                keypoints = self._fill_missing_keypoints(
                     keypoints, scores, heatmaps, tags)
 
                 # refine keypoint coordinates according to heatmap distribution
@@ -500,6 +534,14 @@ def batch_decode(self, batch_heatmaps: Tensor, batch_tags: Tensor
                         blur_kernel_size=self.decode_gaussian_kernel)
                 else:
                     keypoints = refine_keypoints(keypoints, heatmaps)
+                    # The following 0.5-pixel shift is adapted from mmpose 0.x
+                    # where the heatmap center is calculated by a biased
+                    # rounding ``mu=[int(x), int(y)]``. We keep this shift
+                    # operation for now to be compatible with 0.x checkpoints.
+                    # In mmpose 1.x, AE heatmap center is calculated by the
+                    # unbiased rounding ``mu=[int(x+0.5), int(y+0.5)]``, so the
+                    # following shift will be removed in the future.
+                    keypoints += 0.5
 
             batch_keypoints.append(keypoints)
             batch_keypoint_scores.append(scores)
diff --git a/mmpose/datasets/transforms/bottomup_transforms.py b/mmpose/datasets/transforms/bottomup_transforms.py
index c31e0ae17d..1355d3359a 100644
--- a/mmpose/datasets/transforms/bottomup_transforms.py
+++ b/mmpose/datasets/transforms/bottomup_transforms.py
@@ -484,6 +484,7 @@ def transform(self, results: Dict) -> Optional[dict]:
                     output_size=actual_input_size)
             else:
                 center = np.array([img_w / 2, img_h / 2], dtype=np.float32)
+                # center = np.round(center)
                 scale = np.array([
                     img_w * padded_input_size[0] / actual_input_size[0],
                     img_h * padded_input_size[1] / actual_input_size[1]
@@ -495,11 +496,18 @@ def transform(self, results: Dict) -> Optional[dict]:
                     rot=0,
                     output_size=padded_input_size)
 
-            _img = cv2.warpAffine(
-                img, warp_mat, padded_input_size, flags=cv2.INTER_LINEAR)
+            _img = cv2.warpAffine(img, warp_mat, padded_input_size)
 
             imgs.append(_img)
 
+            # print('#' * 20)
+            # print('w,h: ', img_w, img_h, 'center: ', center, 'scale: ',
+            # scale,
+            #       'actual_input_size: ', actual_input_size,
+            #       'padded_input_size: ', padded_input_size)
+            # print(warp_mat)
+            # print('#' * 20)
+
             # Store the transform information w.r.t. the main input size
             if i == 0:
                 results['img_shape'] = padded_input_size[::-1]
diff --git a/mmpose/models/heads/heatmap_heads/ae_head.py b/mmpose/models/heads/heatmap_heads/ae_head.py
index bd12d57a33..451df0bbab 100644
--- a/mmpose/models/heads/heatmap_heads/ae_head.py
+++ b/mmpose/models/heads/heatmap_heads/ae_head.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Sequence, Tuple, Union
 
 import torch
+import torch.nn.functional as F
 from mmengine.structures import PixelData
 from mmengine.utils import is_list_of
 from torch import Tensor
@@ -110,7 +111,7 @@ def predict(self,
             # TTA: multi-scale test
             assert is_list_of(feats, list if flip_test else tuple)
         else:
-            assert is_list_of(feats, tuple if flip_test else Tensor)
+            assert isinstance(feats, list if flip_test else tuple)
             feats = [feats]
 
         # resize heatmaps to align with with input size
@@ -129,6 +130,15 @@ def predict(self,
         for scale_idx, _feats in enumerate(feats):
             if not flip_test:
                 _heatmaps, _tags = self.forward(_feats)
+                if heatmap_size:
+                    _heatmaps = F.interpolate(
+                        _heatmaps, (img_h, img_w),
+                        mode='bilinear',
+                        align_corners=align_corners)
+                    _tags = F.interpolate(
+                        _tags, (img_h, img_w),
+                        mode='bilinear',
+                        align_corners=align_corners)
 
             else:
                 # TTA: flip test