Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions memory.x
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,18 @@ MEMORY {
/*
* This is the bottom of the four striped banks of SRAM in the RP2040.
*/
RAM_OS : ORIGIN = 0x20000000, LENGTH = 0x42000 - 0x9630
RAM_OS : ORIGIN = 0x20000000, LENGTH = 0x42000 - 0x9690
/*
* This is the top of the four striped banks of SRAM in the RP2040, plus
* SRAM_BANK4 and SRAM_BANK5.
*
* This is carefully calculated to give us 8 KiB of stack space and ensure
* the defmt buffer doesn't span across SRAM_BANK3 and SRAM_BANK4.
*
* 0x9630 should be the (size of .data + size of .bss + size of .uninit +
* 0x9690 should be the (size of .data + size of .bss + size of .uninit +
* 0x2000 for the stack).
*/
RAM : ORIGIN = 0x20042000 - 0x9630, LENGTH = 0x9630
RAM : ORIGIN = 0x20042000 - 0x9690, LENGTH = 0x9690
}

/*
Expand Down
137 changes: 123 additions & 14 deletions src/vga/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,10 @@ impl RenderEngine {
// Bitmap with 4 bits per pixel
self.draw_next_line_chunky4(scan_line_buffer, current_line_num);
}
neotron_common_bios::video::Format::Chunky8 => {
// Bitmap with 8 bits per pixel
self.draw_next_line_chunky8(scan_line_buffer, current_line_num);
}
_ => {
// Draw nothing
}
Expand All @@ -253,9 +257,13 @@ impl RenderEngine {
let line_start = unsafe { base_ptr.add(offset) };
// Get a pointer into our scan-line buffer
let mut scan_line_buffer_ptr = scan_line_buffer.pixel_ptr();
let black_pixel = RGBColour(VIDEO_PALETTE[0].load(Ordering::Relaxed));
let white_pixel = RGBColour(VIDEO_PALETTE[1].load(Ordering::Relaxed));
if is_double {
let white_pixel = RGBColour(
VIDEO_PALETTE[TextForegroundColour::White as usize].load(Ordering::Relaxed),
);
let black_pixel = RGBColour(
VIDEO_PALETTE[TextForegroundColour::Black as usize].load(Ordering::Relaxed),
);
// double-width mode.
// sixteen RGB pixels (eight pairs) per byte
let white_pair = RGBPair::from_pixels(white_pixel, white_pixel);
Expand Down Expand Up @@ -465,11 +473,114 @@ impl RenderEngine {
}
}
} else {
for col in 0..line_len_bytes {
// This code optimises poorly, leaving a load from the literal pool in the middle of the for loop.
//
// for col in 0..line_len_bytes {
// unsafe {
// let pixel_pair = line_start_bytes.add(col).read();
// let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
// scan_line_buffer_ptr.write(pair);
// scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
// }
// }

// So I wrote it by hand in assembly instead, saving two clock cycles per loop
// We have 640x4 (320x8) input and must produce 320x32 output
unsafe {
core::arch::asm!(
"0:",
// load a byte from line_start_bytes
"ldrb {tmp}, [{lsb}]",
// multiply it by sizeof(u32)
"lsls {tmp}, {tmp}, #0x2",
// load a 32-bit RGB pair from CHUNKY4_COLOUR_LOOKUP
"ldr {tmp}, [{chunky}, {tmp}]",
// store the 32-bit RGB pair to the scanline buffer, and increment
"stm {slbp}!, {{ {tmp} }}",
// increment the pointer to the start of the line
"adds {lsb}, {lsb}, #0x1",
// loop until we're done
"cmp {lsb}, {lsb_max}",
"bne 0b",
lsb = in(reg) line_start_bytes,
lsb_max = in(reg) line_start_bytes.add(line_len_bytes),
chunky = in(reg) core::ptr::addr_of!(CHUNKY4_COLOUR_LOOKUP),
tmp = in(reg) 0,
slbp = in(reg) scan_line_buffer_ptr,
);
}
}
}

/// Draw a line of 8-bpp bitmap as pixels.
///
/// Writes into the relevant pixel buffer (either [`PIXEL_DATA_BUFFER_ODD`]
/// or [`PIXEL_DATA_BUFFER_EVEN`]) assuming the framebuffer is a bitmap.
///
/// The `current_line_num` goes from `0..NUM_LINES`.
#[link_section = ".data"]
pub fn draw_next_line_chunky8(&mut self, scan_line_buffer: &LineBuffer, current_line_num: u16) {
let is_double = self.current_video_mode.is_horiz_2x();
let base_ptr = self.current_video_ptr as *const u8;
let line_len_bytes = self.current_video_mode.line_size_bytes();
let line_start_offset_bytes = usize::from(current_line_num) * line_len_bytes;
let line_start_bytes = unsafe { base_ptr.add(line_start_offset_bytes) };
// Get a pointer into our scan-line buffer
let mut scan_line_buffer_ptr = scan_line_buffer.pixel_ptr();
let palette_ptr = VIDEO_PALETTE.as_ptr() as *const RGBColour;
if is_double {
// Double-width mode.
// two RGB pixels (one pair) per byte

// This code optimises poorly
// for col in 0..line_len_bytes {
// unsafe {
// let chunky_pixel = line_start_bytes.add(col).read() as usize;
// let rgb = palette_ptr.add(chunky_pixel).read();
// scan_line_buffer_ptr.write(RGBPair::from_pixels(rgb, rgb));
// scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
// }
// }

// So I wrote it by hand in assembly instead, saving two clock cycles per loop
// We have 320x8 input and must produce 320x32 output
unsafe {
core::arch::asm!(
"0:",
// load a byte from line_start_bytes
"ldrb {tmp}, [{lsb}]",
// multiply it by sizeof(u16)
"lsls {tmp}, {tmp}, #0x1",
// load a single 16-bit RGB value from the palette
"ldrh {tmp}, [{palette}, {tmp}]",
// double it up to make a 32-bit RGB pair containing two identical pixels
"lsls {tmp2}, {tmp}, #16",
"adds {tmp}, {tmp}, {tmp2}",
// store the 32-bit RGB pair to the scanline buffer, and increment
"stm {slbp}!, {{ {tmp} }}",
// increment the pointer to the start of the line
"adds {lsb}, {lsb}, #0x1",
// loop until we're done
"cmp {lsb}, {lsb_max}",
"bne 0b",
lsb = in(reg) line_start_bytes,
lsb_max = in(reg) line_start_bytes.add(line_len_bytes),
palette = in(reg) core::ptr::addr_of!(VIDEO_PALETTE),
tmp = in(reg) 0,
tmp2 = in(reg) 1,
slbp = in(reg) scan_line_buffer_ptr,
);
}
} else {
// Single-width mode. This won't run fast enough on an RP2040, but no supported mode uses it.
// one RGB pixel per byte
for col in 0..line_len_bytes / 2 {
unsafe {
let pixel_pair = line_start_bytes.add(col).read();
let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
scan_line_buffer_ptr.write(pair);
let chunky_pixel_left = line_start_bytes.add(col * 2).read() as usize;
let rgb_left = palette_ptr.add(chunky_pixel_left).read();
let chunky_pixel_right = line_start_bytes.add((col * 2) + 1).read() as usize;
let rgb_right = palette_ptr.add(chunky_pixel_right).read();
scan_line_buffer_ptr.write(RGBPair::from_pixels(rgb_left, rgb_right));
scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
}
}
Expand Down Expand Up @@ -1057,14 +1168,6 @@ impl Chunky4ColourLookup {
}
}
}

/// Convert one byte holding a pair of chunky4 pixels into a pair of RGB pixels.
///
/// The byte value is used directly as the index into `self.entries`. The
/// raw value stored there is read with a relaxed atomic load and wrapped in
/// an [`RGBPair`].
#[inline]
fn lookup(&self, pixel_pair: u8) -> RGBPair {
    // Widen the byte losslessly so it can index the table.
    let entry = &self.entries[usize::from(pixel_pair)];
    RGBPair(entry.load(Ordering::Relaxed))
}
}

// -----------------------------------------------------------------------------
Expand Down Expand Up @@ -1983,6 +2086,12 @@ pub fn test_video_mode(mode: neotron_common_bios::video::Mode) -> bool {
| neotron_common_bios::video::Format::Chunky4,
true,
false,
) | (
neotron_common_bios::video::Timing::T640x480
| neotron_common_bios::video::Timing::T640x400,
neotron_common_bios::video::Format::Chunky8,
true,
false
)
)
}
Expand Down
Loading