fix alignment in readSourceFileToEndAlloc

gooncreeper · gooncreeper · commit 8c520aa91594 · 2025-07-20T15:23:59.000-04:00
Previously, it would return an align(2) slice as provided by
toOwnedSliceSentinel, which was problematic as the caller owns the
memory and would mistakenly free it as an align(1) slice.

The array list was originally changed to be align(2) to avoid
reallocation when converting UTF16LE to UTF8; however, this commit
changes it back to align(1) and instead adds a function to std.unicode
to convert unaligned UTF-16LE code units.
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
@@ -477,7 +477,7 @@ pub const Utf16LeIterator = struct {
     bytes: []const u8,
     i: usize,
 
-    pub fn init(s: []const u16) Utf16LeIterator {
+    pub fn init(s: []align(1) const u16) Utf16LeIterator {
         return Utf16LeIterator{
             .bytes = mem.sliceAsBytes(s),
             .i = 0,
@@ -917,7 +917,8 @@ test fmtUtf8 {
 
 fn utf16LeToUtf8ArrayListImpl(
     result: *std.ArrayList(u8),
-    utf16le: []const u16,
+    comptime alignment: std.mem.Alignment,
+    utf16le: []align(alignment.toByteUnits()) const u16,
     comptime surrogates: Surrogates,
 ) (switch (surrogates) {
     .cannot_encode_surrogate_half => Utf16LeToUtf8AllocError,
@@ -969,7 +970,7 @@ pub const Utf16LeToUtf8AllocError = mem.Allocator.Error || Utf16LeToUtf8Error;
 
 pub fn utf16LeToUtf8ArrayList(result: *std.ArrayList(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
     try result.ensureUnusedCapacity(utf16le.len);
-    return utf16LeToUtf8ArrayListImpl(result, utf16le, .cannot_encode_surrogate_half);
+    return utf16LeToUtf8ArrayListImpl(result, .of(u16), utf16le, .cannot_encode_surrogate_half);
 }
 
 /// Caller must free returned memory.
@@ -978,17 +979,26 @@ pub fn utf16LeToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) Utf16L
     var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len);
     errdefer result.deinit();
 
-    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
+    try utf16LeToUtf8ArrayListImpl(&result, .of(u16), utf16le, .cannot_encode_surrogate_half);
     return result.toOwnedSlice();
 }
 
 /// Caller must free returned memory.
 pub fn utf16LeToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) Utf16LeToUtf8AllocError![:0]u8 {
+    return alignedUtf16LeToUtf8AllocZ(allocator, .of(u16), utf16le);
+}
+
+/// Caller must free returned memory.
+pub fn alignedUtf16LeToUtf8AllocZ(
+    allocator: mem.Allocator,
+    comptime alignment: mem.Alignment,
+    utf16le: []align(alignment.toByteUnits()) const u16,
+) Utf16LeToUtf8AllocError![:0]u8 {
     // optimistically guess that it will all be ascii (and allocate space for the null terminator)
     var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len + 1);
     errdefer result.deinit();
 
-    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
+    try utf16LeToUtf8ArrayListImpl(&result, alignment, utf16le, .cannot_encode_surrogate_half);
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1752,7 +1762,7 @@ pub const Wtf8Iterator = struct {
 
 pub fn wtf16LeToWtf8ArrayList(result: *std.ArrayList(u8), utf16le: []const u16) mem.Allocator.Error!void {
     try result.ensureUnusedCapacity(utf16le.len);
-    return utf16LeToUtf8ArrayListImpl(result, utf16le, .can_encode_surrogate_half);
+    return utf16LeToUtf8ArrayListImpl(result, .of(u16), utf16le, .can_encode_surrogate_half);
 }
 
 /// Caller must free returned memory.
@@ -1761,7 +1771,7 @@ pub fn wtf16LeToWtf8Alloc(allocator: mem.Allocator, wtf16le: []const u16) mem.Al
     var result = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len);
     errdefer result.deinit();
 
-    try utf16LeToUtf8ArrayListImpl(&result, wtf16le, .can_encode_surrogate_half);
+    try utf16LeToUtf8ArrayListImpl(&result, .of(u16), wtf16le, .can_encode_surrogate_half);
     return result.toOwnedSlice();
 }
 
@@ -1771,7 +1781,7 @@ pub fn wtf16LeToWtf8AllocZ(allocator: mem.Allocator, wtf16le: []const u16) mem.A
     var result = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len + 1);
     errdefer result.deinit();
 
-    try utf16LeToUtf8ArrayListImpl(&result, wtf16le, .can_encode_surrogate_half);
+    try utf16LeToUtf8ArrayListImpl(&result, .of(u16), wtf16le, .can_encode_surrogate_half);
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1979,7 +1989,7 @@ pub const Wtf16LeIterator = struct {
     bytes: []const u8,
     i: usize,
 
-    pub fn init(s: []const u16) Wtf16LeIterator {
+    pub fn init(s: []align(1) const u16) Wtf16LeIterator {
         return Wtf16LeIterator{
             .bytes = mem.sliceAsBytes(s),
             .i = 0,
diff --git a/lib/std/zig.zig b/lib/std/zig.zig
@@ -534,7 +534,7 @@ test isUnderscore {
 }
 
 pub fn readSourceFileToEndAlloc(gpa: Allocator, file_reader: *std.fs.File.Reader) ![:0]u8 {
-    var buffer: std.ArrayListAlignedUnmanaged(u8, .@"2") = .empty;
+    var buffer: std.ArrayListUnmanaged(u8) = .empty;
     defer buffer.deinit(gpa);
 
     if (file_reader.getSize()) |size| {
@@ -543,7 +543,7 @@ pub fn readSourceFileToEndAlloc(gpa: Allocator, file_reader: *std.fs.File.Reader
         try buffer.ensureTotalCapacityPrecise(gpa, casted_size + 1);
     } else |_| {}
 
-    try file_reader.interface.appendRemaining(gpa, .@"2", &buffer, .limited(max_src_size));
+    try file_reader.interface.appendRemaining(gpa, null, &buffer, .limited(max_src_size));
 
     // Detect unsupported file types with their Byte Order Mark
     const unsupported_boms = [_][]const u8{
@@ -560,7 +560,7 @@ pub fn readSourceFileToEndAlloc(gpa: Allocator, file_reader: *std.fs.File.Reader
     // If the file starts with a UTF-16 little endian BOM, translate it to UTF-8
     if (std.mem.startsWith(u8, buffer.items, "\xff\xfe")) {
         if (buffer.items.len % 2 != 0) return error.InvalidEncoding;
-        return std.unicode.utf16LeToUtf8AllocZ(gpa, @ptrCast(buffer.items)) catch |err| switch (err) {
+        return std.unicode.alignedUtf16LeToUtf8AllocZ(gpa, .@"1", @ptrCast(buffer.items)) catch |err| switch (err) {
             error.DanglingSurrogateHalf => error.UnsupportedEncoding,
             error.ExpectedSecondSurrogateHalf => error.UnsupportedEncoding,
             error.UnexpectedSecondSurrogateHalf => error.UnsupportedEncoding,