|
| 1 | +fork: defer linking file vma until vma is fully initialized |
| 2 | + |
| 3 | +jira LE-1907 |
| 4 | +cve CVE-2024-27022 |
| 5 | +Rebuild_History Non-Buildable kernel-5.14.0-427.37.1.el9_4 |
| 6 | +commit-author Miaohe Lin <linmiaohe@huawei.com> |
| 7 | +commit 35e351780fa9d8240dd6f7e4f245f9ea37e96c19 |
| 8 | +Empty-Commit: Cherry-Pick Conflicts during history rebuild. |
| 9 | +Will be included in final tarball splat. Ref for failed cherry-pick at: |
| 10 | +ciq/ciq_backports/kernel-5.14.0-427.37.1.el9_4/35e35178.failed |
| 11 | + |
| 12 | +Thorvald reported a WARNING [1]. And the root cause is below race: |
| 13 | + |
| 14 | + CPU 1 CPU 2 |
| 15 | + fork hugetlbfs_fallocate |
| 16 | + dup_mmap hugetlbfs_punch_hole |
| 17 | + i_mmap_lock_write(mapping); |
| 18 | + vma_interval_tree_insert_after -- Child vma is visible through i_mmap tree. |
| 19 | + i_mmap_unlock_write(mapping); |
| 20 | + hugetlb_dup_vma_private -- Clear vma_lock outside i_mmap_rwsem! |
| 21 | + i_mmap_lock_write(mapping); |
| 22 | + hugetlb_vmdelete_list |
| 23 | + vma_interval_tree_foreach |
| 24 | + hugetlb_vma_trylock_write -- Vma_lock is cleared. |
| 25 | + tmp->vm_ops->open -- Alloc new vma_lock outside i_mmap_rwsem! |
| 26 | + hugetlb_vma_unlock_write -- Vma_lock is assigned!!! |
| 27 | + i_mmap_unlock_write(mapping); |
| 28 | + |
| 29 | +hugetlb_dup_vma_private() and hugetlb_vm_op_open() are called outside
| 30 | +i_mmap_rwsem lock while the vma lock can be used at the same time. Fix this
| 31 | +by deferring linking the file vma until the vma is fully initialized. Those
| 32 | +vmas should be initialized first before they can be used.
| 33 | + |
| 34 | +Link: https://lkml.kernel.org/r/20240410091441.3539905-1-linmiaohe@huawei.com |
| 35 | +Fixes: 8d9bfb260814 ("hugetlb: add vma based lock for pmd sharing") |
| 36 | + Signed-off-by: Miaohe Lin <linmiaohe@huawei.com> |
| 37 | + Reported-by: Thorvald Natvig <thorvald@google.com> |
| 38 | +Closes: https://lore.kernel.org/linux-mm/20240129161735.6gmjsswx62o4pbja@revolver/T/ [1] |
| 39 | + Reviewed-by: Jane Chu <jane.chu@oracle.com> |
| 40 | + Cc: Christian Brauner <brauner@kernel.org> |
| 41 | + Cc: Heiko Carstens <hca@linux.ibm.com> |
| 42 | + Cc: Kent Overstreet <kent.overstreet@linux.dev> |
| 43 | + Cc: Liam R. Howlett <Liam.Howlett@oracle.com> |
| 44 | + Cc: Mateusz Guzik <mjguzik@gmail.com> |
| 45 | + Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| 46 | + Cc: Miaohe Lin <linmiaohe@huawei.com> |
| 47 | + Cc: Muchun Song <muchun.song@linux.dev> |
| 48 | + Cc: Oleg Nesterov <oleg@redhat.com> |
| 49 | + Cc: Peng Zhang <zhangpeng.00@bytedance.com> |
| 50 | + Cc: Tycho Andersen <tandersen@netflix.com> |
| 51 | + Cc: <stable@vger.kernel.org> |
| 52 | + Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| 53 | +(cherry picked from commit 35e351780fa9d8240dd6f7e4f245f9ea37e96c19) |
| 54 | + Signed-off-by: Jonathan Maple <jmaple@ciq.com> |
| 55 | + |
| 56 | +# Conflicts: |
| 57 | +# kernel/fork.c |
| 58 | +diff --cc kernel/fork.c |
| 59 | +index a97c37970134,aebb3e6c96dc..000000000000 |
| 60 | +--- a/kernel/fork.c |
| 61 | ++++ b/kernel/fork.c |
| 62 | +@@@ -662,7 -713,24 +662,28 @@@ static __latent_entropy int dup_mmap(st |
| 63 | + tmp->anon_vma = NULL; |
| 64 | + } else if (anon_vma_fork(tmp, mpnt)) |
| 65 | + goto fail_nomem_anon_vma_fork; |
| 66 | +++<<<<<<< HEAD |
| 67 | + + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); |
| 68 | +++======= |
| 69 | ++ vm_flags_clear(tmp, VM_LOCKED_MASK); |
| 70 | ++ /* |
| 71 | ++ * Copy/update hugetlb private vma information. |
| 72 | ++ */ |
| 73 | ++ if (is_vm_hugetlb_page(tmp)) |
| 74 | ++ hugetlb_dup_vma_private(tmp); |
| 75 | ++ |
| 76 | ++ /* |
| 77 | ++ * Link the vma into the MT. After using __mt_dup(), memory |
| 78 | ++ * allocation is not necessary here, so it cannot fail. |
| 79 | ++ */ |
| 80 | ++ vma_iter_bulk_store(&vmi, tmp); |
| 81 | ++ |
| 82 | ++ mm->map_count++; |
| 83 | ++ |
| 84 | ++ if (tmp->vm_ops && tmp->vm_ops->open) |
| 85 | ++ tmp->vm_ops->open(tmp); |
| 86 | ++ |
| 87 | +++>>>>>>> 35e351780fa9 (fork: defer linking file vma until vma is fully initialized) |
| 88 | + file = tmp->vm_file; |
| 89 | + if (file) { |
| 90 | + struct address_space *mapping = file->f_mapping; |
| 91 | +@@@ -679,33 -747,13 +700,43 @@@ |
| 92 | + i_mmap_unlock_write(mapping); |
| 93 | + } |
| 94 | + |
| 95 | +++<<<<<<< HEAD |
| 96 | + + /* |
| 97 | + + * Copy/update hugetlb private vma information. |
| 98 | + + */ |
| 99 | + + if (is_vm_hugetlb_page(tmp)) |
| 100 | + + hugetlb_dup_vma_private(tmp); |
| 101 | + + |
| 102 | + + /* |
| 103 | + + * Link in the new vma and copy the page table entries. |
| 104 | + + */ |
| 105 | + + *pprev = tmp; |
| 106 | + + pprev = &tmp->vm_next; |
| 107 | + + tmp->vm_prev = prev; |
| 108 | + + prev = tmp; |
| 109 | + + |
| 110 | + + __vma_link_rb(mm, tmp, rb_link, rb_parent); |
| 111 | + + rb_link = &tmp->vm_rb.rb_right; |
| 112 | + + rb_parent = &tmp->vm_rb; |
| 113 | + + |
| 114 | + + mm->map_count++; |
| 115 | + + if (!(tmp->vm_flags & VM_WIPEONFORK)) |
| 116 | + + retval = copy_page_range(tmp, mpnt); |
| 117 | + + |
| 118 | + + if (tmp->vm_ops && tmp->vm_ops->open) |
| 119 | + + tmp->vm_ops->open(tmp); |
| 120 | + + |
| 121 | + + if (retval) |
| 122 | + + goto out; |
| 123 | +++======= |
| 124 | ++ if (!(tmp->vm_flags & VM_WIPEONFORK)) |
| 125 | ++ retval = copy_page_range(tmp, mpnt); |
| 126 | ++ |
| 127 | ++ if (retval) { |
| 128 | ++ mpnt = vma_next(&vmi); |
| 129 | ++ goto loop_out; |
| 130 | ++ } |
| 131 | +++>>>>>>> 35e351780fa9 (fork: defer linking file vma until vma is fully initialized) |
| 132 | + } |
| 133 | + /* a new mm has just been created */ |
| 134 | + retval = arch_dup_mmap(oldmm, mm); |
| 135 | +* Unmerged path kernel/fork.c |
0 commit comments