From 2443943dacb49febb5b20fd624e016b610bb9c33 Mon Sep 17 00:00:00 2001 From: Jonathan 'theJPster' Pallant Date: Thu, 19 Dec 2024 19:22:00 +0000 Subject: [PATCH 1/5] Hand-roll assembly for chunky4 output. Now 640x480 @ 16 colours works on RP2040. --- src/vga/mod.rs | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/src/vga/mod.rs b/src/vga/mod.rs index c7e4bb4..3a020e6 100644 --- a/src/vga/mod.rs +++ b/src/vga/mod.rs @@ -465,13 +465,40 @@ impl RenderEngine { } } } else { - for col in 0..line_len_bytes { - unsafe { - let pixel_pair = line_start_bytes.add(col).read(); - let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair); - scan_line_buffer_ptr.write(pair); - scan_line_buffer_ptr = scan_line_buffer_ptr.add(1); - } + // // This code optimises poorly, leaving a load from the literal pool in the middle of the for loop. + // + // for col in 0..line_len_bytes { + // unsafe { + // let pixel_pair = line_start_bytes.add(col).read(); + // let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair); + // scan_line_buffer_ptr.write(pair); + // scan_line_buffer_ptr = scan_line_buffer_ptr.add(1); + // } + // } + + // So I wrote it by hand in assembly instead, saving two clock cycles per loop + unsafe { + core::arch::asm!( + "0:", + // load a byte from line_start_bytes + "ldrb {tmp}, [{lsb}]", + // multiply it by sizeof(u32) + "lsls {tmp}, {tmp}, #0x2", + // load a 32-bit word from CHUNKY4_COLOUR_LOOKUP[lsb] + "ldr {tmp}, [{chunky}, {tmp}]", + // store the 32-bit word to the scanline buffer, and increment + "stm {slbp}!, {{ {tmp} }}", + // increment the lsb + "adds {lsb}, {lsb}, #0x1", + // loop until we're done + "cmp {lsb}, {lsb_max}", + "bne 0b", + lsb = in(reg) line_start_bytes, + lsb_max = in(reg) line_start_bytes.add(line_len_bytes), + chunky = in(reg) core::ptr::addr_of!(CHUNKY4_COLOUR_LOOKUP), + tmp = in(reg) 0, + slbp = in(reg) scan_line_buffer_ptr, + ); } } } From 5c442a1d480c5d13cf8e3a859d62e005a7908d41 Mon Sep 17 00:00:00 2001 From: Jonathan 'theJPster' Pallant Date: Sat, 28 Dec 2024 17:03:07 +0000 Subject: [PATCH 2/5] Now supports 320x480 @ 256 colours. --- memory.x | 6 ++-- src/vga/mod.rs | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 3 deletions(-) diff --git a/memory.x b/memory.x index eba2c7a..adb7213 100644 --- a/memory.x +++ b/memory.x @@ -25,7 +25,7 @@ MEMORY { /* * This is the bottom of the four striped banks of SRAM in the RP2040. */ - RAM_OS : ORIGIN = 0x20000000, LENGTH = 0x42000 - 0x9630 + RAM_OS : ORIGIN = 0x20000000, LENGTH = 0x42000 - 0x9690 /* * This is the top of the four striped banks of SRAM in the RP2040, plus * SRAM_BANK4 and SRAM_BANK5. @@ -33,10 +33,10 @@ MEMORY { * This is carefully calculated to give us 8 KiB of stack space and ensure * the defmt buffer doesn't span across SRAM_BANK3 and SRAM_BANK4. * - * 0x9630 should be the (size of .data + size of .bss + size of .uninit + + * 0x9690 should be the (size of .data + size of .bss + size of .uninit + * 0x2000 for the stack). */ - RAM : ORIGIN = 0x20042000 - 0x9630, LENGTH = 0x9630 + RAM : ORIGIN = 0x20042000 - 0x9690, LENGTH = 0x9690 } /* diff --git a/src/vga/mod.rs b/src/vga/mod.rs index 3a020e6..a6d2a0a 100644 --- a/src/vga/mod.rs +++ b/src/vga/mod.rs @@ -232,6 +232,10 @@ impl RenderEngine { // Bitmap with 4 bits per pixel self.draw_next_line_chunky4(scan_line_buffer, current_line_num); } + neotron_common_bios::video::Format::Chunky8 => { + // Bitmap with 8 bits per pixel + self.draw_next_line_chunky8(scan_line_buffer, current_line_num); + } _ => { // Draw nothing } @@ -477,6 +481,7 @@ impl RenderEngine { // } // So I wrote it by hand in assembly instead, saving two clock cycles per loop + // We have 320x8 input and must produce 320x32 output unsafe { core::arch::asm!( "0:", @@ -503,6 +508,81 @@ impl RenderEngine { } } + /// Draw a line of 8-bpp bitmap as pixels. + /// + /// Writes into the relevant pixel buffer (either [`PIXEL_DATA_BUFFER_ODD`] + /// or [`PIXEL_DATA_BUFFER_EVEN`]) assuming the framebuffer is a bitmap. + /// + /// The `current_line_num` goes from `0..NUM_LINES`. + #[link_section = ".data"] + pub fn draw_next_line_chunky8(&mut self, scan_line_buffer: &LineBuffer, current_line_num: u16) { + let is_double = self.current_video_mode.is_horiz_2x(); + let base_ptr = self.current_video_ptr as *const u8; + let line_len_bytes = self.current_video_mode.line_size_bytes(); + let line_start_offset_bytes = usize::from(current_line_num) * line_len_bytes; + let line_start_bytes = unsafe { base_ptr.add(line_start_offset_bytes) }; + // Get a pointer into our scan-line buffer + let mut scan_line_buffer_ptr = scan_line_buffer.pixel_ptr(); + let palette_ptr = VIDEO_PALETTE.as_ptr() as *const RGBColour; + if is_double { + // Double-width mode. + // two RGB pixels (one pair) per byte + + // This code optimises poorly + // for col in 0..line_len_bytes { + // unsafe { + // let chunky_pixel = line_start_bytes.add(col).read() as usize; + // let rgb = palette_ptr.add(chunky_pixel).read(); + // scan_line_buffer_ptr.write(RGBPair::from_pixels(rgb, rgb)); + // scan_line_buffer_ptr = scan_line_buffer_ptr.add(1); + // } + // } + + // So I wrote it by hand in assembly instead, saving two clock cycles per loop + // We have 320x8 input and must produce 320x32 output + unsafe { + core::arch::asm!( + "0:", + // load a byte from line_start_bytes + "ldrb {tmp}, [{lsb}]", + // multiply it by sizeof(u16) + "lsls {tmp}, {tmp}, #0x1", + // load a 32-bit word from the palette + "ldrh {tmp}, [{palette}, {tmp}]", + // double it up + "lsls {tmp2}, {tmp}, #16", + "adds {tmp}, {tmp}, {tmp2}", + // store the 32-bit word to the scanline buffer, and increment + "stm {slbp}!, {{ {tmp} }}", + // increment the lsb + "adds {lsb}, {lsb}, #0x1", + // loop until we're done + "cmp {lsb}, {lsb_max}", + "bne 0b", + lsb = in(reg) line_start_bytes, + lsb_max = in(reg) line_start_bytes.add(line_len_bytes), + palette = in(reg) core::ptr::addr_of!(VIDEO_PALETTE), + tmp = in(reg) 0, + tmp2 = in(reg) 1, + slbp = in(reg) scan_line_buffer_ptr, + ); + } + } else { + // Single-width mode. + // one RGB pixel per byte + for col in 0..line_len_bytes / 2 { + unsafe { + let chunky_pixel_left = line_start_bytes.add(col * 2).read() as usize; + let rgb_left = palette_ptr.add(chunky_pixel_left).read(); + let chunky_pixel_right = line_start_bytes.add((col * 2) + 1).read() as usize; + let rgb_right = palette_ptr.add(chunky_pixel_right).read(); + scan_line_buffer_ptr.write(RGBPair::from_pixels(rgb_left, rgb_right)); + scan_line_buffer_ptr = scan_line_buffer_ptr.add(1); + } + } + } + } + /// Draw a line text as pixels. /// /// Writes into the relevant pixel buffer (either [`PIXEL_DATA_BUFFER_ODD`] @@ -2010,6 +2090,12 @@ pub fn test_video_mode(mode: neotron_common_bios::video::Mode) -> bool { | neotron_common_bios::video::Format::Chunky4, true, false, + ) | ( + neotron_common_bios::video::Timing::T640x480 + | neotron_common_bios::video::Timing::T640x400, + neotron_common_bios::video::Format::Chunky8, + true, + false ) ) } From 1780e8fdc51b970a0254c153c77a4871c85e24b8 Mon Sep 17 00:00:00 2001 From: Jonathan 'theJPster' Pallant Date: Sat, 28 Dec 2024 17:03:30 +0000 Subject: [PATCH 3/5] Fix colours in Mode 15 and Mode 31. --- src/vga/mod.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/vga/mod.rs b/src/vga/mod.rs index a6d2a0a..44aa96f 100644 --- a/src/vga/mod.rs +++ b/src/vga/mod.rs @@ -257,9 +257,13 @@ impl RenderEngine { let line_start = unsafe { base_ptr.add(offset) }; // Get a pointer into our scan-line buffer let mut scan_line_buffer_ptr = scan_line_buffer.pixel_ptr(); - let black_pixel = RGBColour(VIDEO_PALETTE[0].load(Ordering::Relaxed)); - let white_pixel = RGBColour(VIDEO_PALETTE[1].load(Ordering::Relaxed)); if is_double { + let white_pixel = RGBColour( + VIDEO_PALETTE[TextForegroundColour::White as usize].load(Ordering::Relaxed), + ); + let black_pixel = RGBColour( + VIDEO_PALETTE[TextForegroundColour::Black as usize].load(Ordering::Relaxed), + ); // double-width mode. // sixteen RGB pixels (eight pairs) per byte let white_pair = RGBPair::from_pixels(white_pixel, white_pixel); From 4beca78004f4e730e79de2f9b371d5b58d5bde30 Mon Sep 17 00:00:00 2001 From: Jonathan 'theJPster' Pallant Date: Sat, 28 Dec 2024 18:05:53 +0000 Subject: [PATCH 4/5] Remove unused function --- src/vga/mod.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/vga/mod.rs b/src/vga/mod.rs index 44aa96f..ab682e8 100644 --- a/src/vga/mod.rs +++ b/src/vga/mod.rs @@ -1168,14 +1168,6 @@ impl Chunky4ColourLookup { } } } - - /// Turn a pair of chunky4 pixels (in a `u8`), into a pair of RGB pixels. - #[inline] - fn lookup(&self, pixel_pair: u8) -> RGBPair { - let index = usize::from(pixel_pair); - let raw = self.entries[index].load(Ordering::Relaxed); - RGBPair(raw) - } } // ----------------------------------------------------------------------------- From b6e627e91046f04d9e657f7df0ce2353f9d71482 Mon Sep 17 00:00:00 2001 From: Jonathan 'theJPster' Pallant Date: Sun, 29 Dec 2024 18:22:20 +0000 Subject: [PATCH 5/5] Apply suggestions from code review Tweaking some comments on the assembly routines --- src/vga/mod.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/vga/mod.rs b/src/vga/mod.rs index ab682e8..3a250d1 100644 --- a/src/vga/mod.rs +++ b/src/vga/mod.rs @@ -485,7 +485,7 @@ impl RenderEngine { // } // So I wrote it by hand in assembly instead, saving two clock cycles per loop - // We have 320x8 input and must produce 320x32 output + // We have 640x4 (320x8) input and must produce 320x32 output unsafe { core::arch::asm!( "0:", @@ -493,11 +493,11 @@ impl RenderEngine { "ldrb {tmp}, [{lsb}]", // multiply it by sizeof(u32) "lsls {tmp}, {tmp}, #0x2", - // load a 32-bit word from CHUNKY4_COLOUR_LOOKUP[lsb] + // load a 32-bit RGB pair from CHUNKY4_COLOUR_LOOKUP "ldr {tmp}, [{chunky}, {tmp}]", - // store the 32-bit word to the scanline buffer, and increment + // store the 32-bit RGB pair to the scanline buffer, and increment "stm {slbp}!, {{ {tmp} }}", - // increment the lsb + // increment the pointer to the start of the line "adds {lsb}, {lsb}, #0x1", // loop until we're done "cmp {lsb}, {lsb_max}", @@ -551,14 +551,14 @@ impl RenderEngine { "ldrb {tmp}, [{lsb}]", // multiply it by sizeof(u16) "lsls {tmp}, {tmp}, #0x1", - // load a 32-bit word from the palette + // load a single 16-bit RGB value from the palette "ldrh {tmp}, [{palette}, {tmp}]", - // double it up + // double it up to make a 32-bit RGB pair containing two identical pixels "lsls {tmp2}, {tmp}, #16", "adds {tmp}, {tmp}, {tmp2}", - // store the 32-bit word to the scanline buffer, and increment + // store the 32-bit RGB pair to the scanline buffer, and increment "stm {slbp}!, {{ {tmp} }}", - // increment the lsb + // increment the pointer to the start of the line "adds {lsb}, {lsb}, #0x1", // loop until we're done "cmp {lsb}, {lsb_max}", @@ -572,7 +572,7 @@ impl RenderEngine { ); } } else { - // Single-width mode. + // Single-width mode. This won't run fast enough on an RP2040, but no supported mode uses it. // one RGB pixel per byte for col in 0..line_len_bytes / 2 { unsafe {