@@ -884,13 +884,13 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
  * Perform checks before attempting to write a VMA prior to it being
  * moved.
  */
-static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
-				   unsigned long *vm_flags_ptr)
+static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
 {
 	unsigned long err = 0;
 	struct vm_area_struct *vma = vrm->vma;
 	unsigned long old_addr = vrm->addr;
 	unsigned long old_len = vrm->old_len;
+	unsigned long dummy = vma->vm_flags;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -916,85 +916,236 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
 	 * so KSM can come around to merge on vma and new_vma afterwards.
 	 */
 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
-			  MADV_UNMERGEABLE, vm_flags_ptr);
+			  MADV_UNMERGEABLE, &dummy);
 	if (err)
 		return err;
 
 	return 0;
 }
 
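For context: ksm_madvise() is the kernel-side backend of the madvise(2) MADV_MERGEABLE/MADV_UNMERGEABLE hints. Below is a minimal userspace sketch of the same opt-out that prep_move_vma() applies before a move; it is illustrative only (KSM support requires CONFIG_KSM) and not part of this patch.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, len);

	/* Opt the region in to KSM merging... */
	if (madvise(p, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)"); /* EINVAL without CONFIG_KSM */

	/* ...and back out again, the same hint prep_move_vma() issues
	 * in-kernel via ksm_madvise() before moving the VMA. */
	if (madvise(p, len, MADV_UNMERGEABLE))
		perror("madvise(MADV_UNMERGEABLE)");

	munmap(p, len);
	return 0;
}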
-static unsigned long move_vma(struct vma_remap_struct *vrm)
+/*
+ * Unmap the source VMA for a VMA move, turning it from a copy to a move,
+ * being careful to ensure we do not underflow the memory account while doing
+ * so if this is an accountable move.
+ *
+ * This is best effort; if we fail to unmap then we simply try to correct
+ * accounting and exit.
+ */
+static void unmap_source_vma(struct vma_remap_struct *vrm)
 {
 	struct mm_struct *mm = current->mm;
+	unsigned long addr = vrm->addr;
+	unsigned long len = vrm->old_len;
 	struct vm_area_struct *vma = vrm->vma;
-	struct vm_area_struct *new_vma;
-	unsigned long vm_flags = vma->vm_flags;
-	unsigned long old_addr = vrm->addr, new_addr = vrm->new_addr;
-	unsigned long old_len = vrm->old_len, new_len = vrm->new_len;
-	unsigned long new_pgoff;
-	unsigned long moved_len;
-	unsigned long account_start = false;
-	unsigned long account_end = false;
-	unsigned long hiwater_vm;
+	VMA_ITERATOR(vmi, mm, addr);
 	int err;
-	bool need_rmap_locks;
-	struct vma_iterator vmi;
+	unsigned long vm_start;
+	unsigned long vm_end;
+	/*
+	 * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
+	 * function implies that we unmap the original VMA, which seems
+	 * contradictory.
+	 *
+	 * However, this occurs when this operation was attempted and an error
+	 * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
+	 * we actually _do_ want it to be unaccounted.
+	 */
+	bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
+				!(vrm->flags & MREMAP_DONTUNMAP);
 
-	err = prep_move_vma(vrm, &vm_flags);
-	if (err)
-		return err;
+	/*
+	 * So we perform a trick here to prevent incorrect accounting. Any merge
+	 * or new VMA allocation performed in copy_vma() does not adjust
+	 * accounting; it is expected that callers handle this.
+	 *
+	 * And indeed we already have, accounting appropriately for both cases
+	 * in vrm_charge().
+	 *
+	 * However, when we unmap the existing VMA (to effect the move), this
+	 * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
+	 * removed pages.
+	 *
+	 * To avoid this we temporarily clear this flag, reinstating it on any
+	 * portions of the original VMA that remain.
+	 */
+	if (accountable_move) {
+		vm_flags_clear(vma, VM_ACCOUNT);
+		/* We are about to split vma, so store the start/end. */
+		vm_start = vma->vm_start;
+		vm_end = vma->vm_end;
+	}
 
-	/* If accounted, charge the number of bytes the operation will use. */
-	if (!vrm_charge(vrm))
-		return -ENOMEM;
+	err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */ false);
+	vrm->vma = NULL; /* Invalidated. */
+	if (err) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_acct_memory(len >> PAGE_SHIFT);
+		return;
+	}
 
-	vma_start_write(vma);
-	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vrm->vma, new_addr, new_len, new_pgoff,
+	/*
+	 * If we mremap() from a VMA like this:
+	 *
+	 *    addr   end
+	 *     |      |
+	 *     v      v
+	 * |-------------|
+	 * |             |
+	 * |-------------|
+	 *
+	 * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
+	 * we'll end up with:
+	 *
+	 *    addr   end
+	 *     |      |
+	 *     v      v
+	 * |---|      |---|
+	 * | A |      | B |
+	 * |---|      |---|
+	 *
+	 * The VMI is still pointing at addr, so vma_prev() will give us A, and
+	 * a subsequent or lone vma_next() will give us B.
+	 *
+	 * do_vmi_munmap() will have restored the VMI back to addr.
+	 */
+	if (accountable_move) {
+		unsigned long end = addr + len;
+
+		if (vm_start < addr) {
+			struct vm_area_struct *prev = vma_prev(&vmi);
+
+			vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
+		}
+
+		if (vm_end > end) {
+			struct vm_area_struct *next = vma_next(&vmi);
+
+			vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
+		}
+	}
+}
+
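The A/B split pictured in the diagram above is observable from userspace: moving only the interior of a mapping forces the kernel to split the source VMA into two remainders. A minimal sketch, assuming 4 KiB pages (inspect /proc/self/maps while it runs to see A and B as separate entries); not part of this patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	const long page = 4096; /* assumes 4 KiB pages */
	char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Move just the middle page: the source VMA is split into the
	 * A and B remainders shown in the diagram above. */
	void *q = mremap(p + page, page, page, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return 1;
	}

	printf("A: %p..%p\n", (void *)p, (void *)(p + page));
	printf("B: %p..%p\n", (void *)(p + 2 * page), (void *)(p + 3 * page));
	printf("moved page: %p\n", q);
	return 0;
}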
+/*
+ * Copy vrm->vma over to vrm->new_addr, possibly adjusting size as part of the
+ * process. Additionally handle an error occurring on moving of page tables,
+ * where we reset vrm state to cause unmapping of the new VMA.
+ *
+ * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
+ * error code.
+ */
+static int copy_vma_and_data(struct vma_remap_struct *vrm,
+			     struct vm_area_struct **new_vma_ptr)
+{
+	unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
+	unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
+	unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
+	unsigned long moved_len;
+	bool need_rmap_locks;
+	struct vm_area_struct *vma;
+	struct vm_area_struct *new_vma;
+	int err = 0;
+
+	new_vma = copy_vma(&vrm->vma, vrm->new_addr, vrm->new_len, new_pgoff,
 			   &need_rmap_locks);
-	/* This may have been updated. */
-	vma = vrm->vma;
 	if (!new_vma) {
 		vrm_uncharge(vrm);
+		*new_vma_ptr = NULL;
 		return -ENOMEM;
 	}
+	vma = vrm->vma;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
-				     need_rmap_locks, false);
-	if (moved_len < old_len) {
+	moved_len = move_page_tables(vma, vrm->addr, new_vma,
+				     vrm->new_addr, vrm->old_len,
+				     need_rmap_locks, /* for_stack= */ false);
+	if (moved_len < vrm->old_len)
 		err = -ENOMEM;
-	} else if (vma->vm_ops && vma->vm_ops->mremap) {
+	else if (vma->vm_ops && vma->vm_ops->mremap)
 		err = vma->vm_ops->mremap(new_vma);
-	}
 
 	if (unlikely(err)) {
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
 		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
-				 true, false);
-		vma = new_vma;
-		old_len = new_len;
-		old_addr = new_addr;
-		new_addr = err;
+		move_page_tables(new_vma, vrm->new_addr, vma, vrm->addr,
+				 moved_len, /* need_rmap_locks= */ true,
+				 /* for_stack= */ false);
+		vrm->vma = new_vma;
+		vrm->old_len = vrm->new_len;
+		vrm->addr = vrm->new_addr;
 	} else {
 		mremap_userfaultfd_prep(new_vma, vrm->uf);
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	if (is_vm_hugetlb_page(vma))
 		clear_vma_resv_huge_pages(vma);
-	}
 
-	/* Conceal VM_ACCOUNT so old reservation is not undone */
-	if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP)) {
-		vm_flags_clear(vma, VM_ACCOUNT);
-		if (vma->vm_start < old_addr)
-			account_start = true;
-		if (vma->vm_end > old_addr + old_len)
-			account_end = true;
-	}
+	/* Tell pfnmap has moved from this vma */
+	if (unlikely(vma->vm_flags & VM_PFNMAP))
+		untrack_pfn_clear(vma);
+
+	*new_vma_ptr = new_vma;
+	return err;
+}
+
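The internal_pgoff arithmetic above keeps the file offset of a moved slice intact. That is visible from userspace when an interior page of a file mapping is moved: the relocated page still maps the same file page. A minimal sketch, assuming 4 KiB pages; not part of this patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const long page = 4096; /* assumes 4 KiB pages */
	FILE *f = tmpfile();
	char buf[4096];
	int i;

	if (!f)
		return 1;
	/* Write three recognisable pages ('A', 'B', 'C') to a temp file. */
	for (i = 0; i < 3; i++) {
		memset(buf, 'A' + i, sizeof(buf));
		fwrite(buf, 1, sizeof(buf), f);
	}
	fflush(f);

	char *p = mmap(NULL, 3 * page, PROT_READ, MAP_SHARED, fileno(f), 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Move the middle page; the kernel derives the new VMA's pgoff
	 * from the offset of the moved range within the source VMA, as
	 * copy_vma_and_data() does with internal_pgoff above. */
	char *q = mremap(p + page, page, page, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("moved page still maps file page 1: %c\n", *q); /* prints B */
	return 0;
}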
+/*
+ * Perform final tasks for the MREMAP_DONTUNMAP operation, clearing mlock() and
+ * account flags on the remaining VMA by convention (it cannot be mlock()'d any
+ * longer, as pages in range are no longer mapped), and removing anon_vma_chain
+ * links from it (if the entire VMA was copied over).
+ */
+static void dontunmap_complete(struct vma_remap_struct *vrm,
+			       struct vm_area_struct *new_vma)
+{
+	unsigned long start = vrm->addr;
+	unsigned long end = vrm->addr + vrm->old_len;
+	unsigned long old_start = vrm->vma->vm_start;
+	unsigned long old_end = vrm->vma->vm_end;
+
+	/*
+	 * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old
+	 * vma.
+	 */
+	vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT);
+
+	/*
+	 * anon_vma links of the old vma are no longer needed after its page
+	 * table has been moved.
+	 */
+	if (new_vma != vrm->vma && start == old_start && end == old_end)
+		unlink_anon_vmas(vrm->vma);
+
+	/* Because we won't unmap we don't need to touch locked_vm. */
+}
+
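This path is reached from userspace via MREMAP_DONTUNMAP (Linux 5.7+, private anonymous mappings only): the pages move to the destination while the source range stays mapped but empties out. A minimal sketch, not part of this patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	char *src = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(src, "hello");

	/* Requires Linux 5.7+ and a private anonymous mapping. The pages
	 * move to dst; src stays mapped, but per dontunmap_complete() it
	 * is no longer mlock()'d or accounted, and is now empty. */
	char *dst = mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (dst == MAP_FAILED) {
		perror("mremap");
		return 1;
	}

	printf("dst: \"%s\"\n", dst);    /* "hello" travelled with the pages */
	printf("src[0]: %d\n", src[0]);  /* faults in a fresh zero page: 0 */
	return 0;
}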
+static unsigned long move_vma(struct vma_remap_struct *vrm)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *new_vma;
+	unsigned long hiwater_vm;
+	int err;
+
+	err = prep_move_vma(vrm);
+	if (err)
+		return err;
+
+	/* If accounted, charge the number of bytes the operation will use. */
+	if (!vrm_charge(vrm))
+		return -ENOMEM;
+
+	/* We don't want racing faults. */
+	vma_start_write(vrm->vma);
+
+	/* Perform copy step. */
+	err = copy_vma_and_data(vrm, &new_vma);
+	/*
+	 * If we established the copied-to VMA, we attempt to recover from the
+	 * error by setting the destination VMA to the source VMA and unmapping
+	 * it below.
+	 */
+	if (err && !new_vma)
+		return err;
 
 	/*
 	 * If we failed to move page tables we still do total_vm increment
@@ -1007,51 +1158,15 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
 	 */
 	hiwater_vm = mm->hiwater_vm;
 
-	/* Tell pfnmap has moved from this vma */
-	if (unlikely(vma->vm_flags & VM_PFNMAP))
-		untrack_pfn_clear(vma);
-
-	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) {
-		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
-		vm_flags_clear(vma, VM_LOCKED_MASK);
-
-		/*
-		 * anon_vma links of the old vma is no longer needed after its page
-		 * table has been moved.
-		 */
-		if (new_vma != vma && vma->vm_start == old_addr &&
-		    vma->vm_end == (old_addr + old_len))
-			unlink_anon_vmas(vma);
-
-		/* Because we won't unmap we don't need to touch locked_vm */
-		vrm_stat_account(vrm, new_len);
-		return new_addr;
-	}
-
-	vrm_stat_account(vrm, new_len);
-
-	vma_iter_init(&vmi, mm, old_addr);
-	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, vrm->uf_unmap, false) < 0) {
-		/* OOM: unable to split vma, just get accounts right */
-		if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP))
-			vm_acct_memory(old_len >> PAGE_SHIFT);
-		account_start = account_end = false;
-	}
+	vrm_stat_account(vrm, vrm->new_len);
+	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
+		dontunmap_complete(vrm, new_vma);
+	else
+		unmap_source_vma(vrm);
 
 	mm->hiwater_vm = hiwater_vm;
 
-	/* Restore VM_ACCOUNT if one or two pieces of vma left */
-	if (account_start) {
-		vma = vma_prev(&vmi);
-		vm_flags_set(vma, VM_ACCOUNT);
-	}
-
-	if (account_end) {
-		vma = vma_next(&vmi);
-		vm_flags_set(vma, VM_ACCOUNT);
-	}
-
-	return new_addr;
+	return err ? (unsigned long)err : vrm->new_addr;
 }
 
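For reference, move_vma() ultimately services an mremap() call that has to relocate a mapping: on success the syscall returns the new address, otherwise MAP_FAILED with errno set, matching the final return above. A minimal caller, not part of this patch; with MREMAP_MAYMOVE the kernel may still grow in place rather than take the move_vma() path.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 4096, new_len = 2 * 4096;
	char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(p, "payload");

	/* MREMAP_MAYMOVE lets the kernel relocate the mapping when it
	 * cannot be grown in place. */
	char *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		fprintf(stderr, "mremap: %s\n", strerror(errno));
		return 1;
	}
	printf("now at %p, data intact: \"%s\"\n", (void *)q, q);
	return 0;
}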
 /*