diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 43eed64f7..9756825cf 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -128,7 +128,11 @@ jobs: - name: Install dependencies shell: bash run: | - sudo apt-get update && sudo apt-get install -y libgeos-dev + sudo apt-get update && sudo apt-get install -y \ + libgeos-dev \ + libgdal-dev \ + gdal-bin \ + pkg-config - name: Build if: matrix.name == 'build' diff --git a/Cargo.lock b/Cargo.lock index e0b82186a..93a695d37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -607,7 +607,7 @@ version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2b715a6010afb9e457ca2b7c9d2b9c344baa8baed7b38dc476034c171b32575" dependencies = [ - "bindgen", + "bindgen 0.72.1", "cc", "cmake", "dunce", @@ -943,6 +943,29 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools 0.11.0", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 2.0.106", + "which", +] + [[package]] name = "bindgen" version = "0.72.1" @@ -958,7 +981,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash", + "rustc-hash 2.1.1", "shlex", "syn 2.0.106", ] @@ -1452,7 +1475,7 @@ dependencies = [ "crossterm_winapi", "document-features", "parking_lot", - "rustix", + "rustix 1.1.2", "winapi", ] @@ -2465,7 +2488,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", - "rustix", + "rustix 1.1.2", "windows-sys 0.59.0", ] @@ -2642,6 +2665,34 @@ dependencies = [ "slab", ] +[[package]] +name = "gdal" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82ab834e8be6b54fee3d0141fce5e776ad405add1f9d0da054281926e0d35a9f" +dependencies = [ + "bitflags", + "chrono", + "gdal-sys", + "geo-types", + "libc", + "once_cell", + "semver", + "thiserror 1.0.69", +] + +[[package]] +name = "gdal-sys" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18ad5d608ee6726efcf6e1d91261eb6dec7da3ee7db6bda984cdfb8a7d65ebf9" +dependencies = [ + "bindgen 0.69.5", + "libc", + "pkg-config", + "semver", +] + [[package]] name = "generational-arena" version = "0.2.9" @@ -3382,6 +3433,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "lexical-core" version = "1.0.6" @@ -3526,6 +3589,12 @@ dependencies = [ "cc", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.11.0" @@ -4246,7 +4315,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.1.1", "rustls", "socket2", "thiserror 2.0.17", @@ -4266,7 +4335,7 @@ dependencies = [ "lru-slab", "rand 0.9.2", "ring", - "rustc-hash", + "rustc-hash 2.1.1", "rustls", "rustls-pki-types", "slab", @@ -4602,6 +4671,12 @@ version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.1" @@ -4617,6 +4692,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.1.2" @@ -4626,7 +4714,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.11.0", "windows-sys 0.61.2", ] @@ -4962,6 +5050,26 @@ dependencies = [ "wkt 0.14.0", ] +[[package]] +name = "sedona-gdal" +version = "0.2.0" +dependencies = [ + "approx", + "arrow", + "arrow-array", + "arrow-schema", + "criterion", + "datafusion-common", + "datafusion-expr", + "gdal", + "gdal-sys", + "rstest", + "sedona-raster", + "sedona-schema", + "sedona-testing", + "tempfile", +] + [[package]] name = "sedona-geo" version = "0.2.0" @@ -5677,7 +5785,7 @@ dependencies = [ "fastrand", "getrandom 0.3.3", "once_cell", - "rustix", + "rustix 1.1.2", "windows-sys 0.61.2", ] @@ -6258,6 +6366,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.44", +] + [[package]] name = "winapi" version = "0.3.9" @@ -6578,7 +6698,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix", + "rustix 1.1.2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 2f7e60571..daaefe444 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ members = [ "rust/sedona-datasource", "rust/sedona-expr", "rust/sedona-functions", + "rust/sedona-gdal", "rust/sedona-geo-generic-alg", "rust/sedona-geo-traits-ext", "rust/sedona-geo", @@ -87,7 +88,7 @@ datafusion-physical-expr = { version = "50.2.0" } datafusion-physical-plan = { version = "50.2.0" } dirs = "6.0.0" env_logger = "0.11" -fastrand = "2.0" +fastrand = "2.3" futures = { version = "0.3" } object_store = { version = "0.12.0", default-features = false } float_next_after = "1" @@ -96,6 +97,9 @@ mimalloc = { version = "0.1", default-features = false } libmimalloc-sys = { version = "0.1", default-features = false } once_cell = "1.20" +gdal = { version = "0.17", features = ["bindgen"] } +gdal-sys = { version = "0.10", features = ["bindgen"] } + geos = { version = "10.0.0", features = ["geo", "v3_10_0"] } geo-types = "0.7.17" diff --git a/rust/sedona-gdal/Cargo.toml b/rust/sedona-gdal/Cargo.toml new file mode 100644 index 000000000..6bc8240dc --- /dev/null +++ b/rust/sedona-gdal/Cargo.toml @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "sedona-gdal" +version.workspace = true +homepage.workspace = true +repository.workspace = true +description.workspace = true +readme.workspace = true +edition.workspace = true +rust-version.workspace = true + +[lints.clippy] +result_large_err = "allow" + +[dev-dependencies] +approx = { workspace = true } +rstest = { workspace = true } +sedona-testing = { path = "../../rust/sedona-testing", features = ["criterion"] } +criterion = { workspace = true} +tempfile = { workspace = true } + +[dependencies] +arrow = { workspace = true } +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +datafusion-common = { workspace = true } +datafusion-expr = { workspace = true } +gdal = {workspace = true} +gdal-sys = {workspace = true} +sedona-raster = { path = "../sedona-raster" } +sedona-schema = { path = "../sedona-schema" } diff --git a/rust/sedona-gdal/src/dataset.rs b/rust/sedona-gdal/src/dataset.rs new file mode 100644 index 000000000..ebbeb9be9 --- /dev/null +++ b/rust/sedona-gdal/src/dataset.rs @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::ArrowError; +use gdal::raster::GdalDataType; +use gdal::Dataset; +use gdal::Metadata; +use sedona_schema::raster::BandDataType; + +/// Extract geotransform components from a GDAL dataset +/// Returns (upper_left_x, upper_left_y, scale_x, scale_y, skew_x, skew_y) +pub fn geotransform_components( + dataset: &Dataset, +) -> Result<(f64, f64, f64, f64, f64, f64), ArrowError> { + let geotransform = dataset + .geo_transform() + .map_err(|e| ArrowError::ParseError(format!("Failed to get geotransform: {e}")))?; + Ok(( + geotransform[0], // Upper-left X coordinate + geotransform[3], // Upper-left Y coordinate + geotransform[1], // scale_x (pixel width) + geotransform[5], // scale_y (pixel height, usually negative) + geotransform[2], // skew_x (X-direction skew) + geotransform[4], // skew_y (Y-direction skew) + )) +} + +/// Extract tile size from a GDAL dataset +pub fn tile_size(dataset: &Dataset) -> (Option, Option) { + let tile_width = match dataset.metadata_item("TILEWIDTH", "") { + Some(val) => val.parse::().ok(), + None => None, + }; + let tile_height = match dataset.metadata_item("TILEHEIGHT", "") { + Some(val) => val.parse::().ok(), + None => None, + }; + + (tile_width, tile_height) +} + +pub fn to_banddatatype(gdal_data_type: GdalDataType) -> Result { + match gdal_data_type { + GdalDataType::UInt8 => Ok(BandDataType::UInt8), + GdalDataType::UInt16 => Ok(BandDataType::UInt16), + GdalDataType::Int16 => Ok(BandDataType::Int16), + GdalDataType::UInt32 => Ok(BandDataType::UInt32), + GdalDataType::Int32 => Ok(BandDataType::Int32), + GdalDataType::Float32 => Ok(BandDataType::Float32), + GdalDataType::Float64 => Ok(BandDataType::Float64), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Unsupported GDAL data type: {:?}", + gdal_data_type + ))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use gdal::DriverManager; + + #[test] + fn test_geotransform_components() { + let driver = DriverManager::get_driver_by_name("MEM").unwrap(); + let mut dataset = driver + .create_with_band_type::("", 100, 100, 1) + .unwrap(); + + let (upper_left_x, upper_left_y, pixel_width, pixel_height, rotation_x, rotation_y) = + (10.0, 20.0, 1.5, -2.5, 0.1, 0.2); + + // Add some basic georeferencing + dataset + .set_geo_transform(&[ + upper_left_x, + pixel_width, + rotation_x, + upper_left_y, + rotation_y, + pixel_height, + ]) + .unwrap(); + + let (ulx, uly, sx, sy, rx, ry) = geotransform_components(&dataset).unwrap(); + assert_eq!(ulx, upper_left_x); + assert_eq!(uly, upper_left_y); + assert_eq!(sx, pixel_width); + assert_eq!(sy, pixel_height); + assert_eq!(rx, rotation_x); + assert_eq!(ry, rotation_y); + } + + #[test] + fn test_tile_size() -> Result<(), ArrowError> { + let driver = DriverManager::get_driver_by_name("MEM") + .map_err(|e| ArrowError::ParseError(format!("Failed to get MEM driver: {e}")))?; + let mut dataset = driver + .create_with_band_type::("", 256, 512, 1) + .map_err(|e| ArrowError::ParseError(format!("Failed to create dataset: {e}")))?; + + // Set real tile size metadata + dataset + .set_metadata_item("TILEWIDTH", "256", "") + .map_err(|e| ArrowError::ParseError(format!("Failed to set TILEWIDTH: {e}")))?; + dataset + .set_metadata_item("TILEHEIGHT", "512", "") + .map_err(|e| ArrowError::ParseError(format!("Failed to set TILEHEIGHT: {e}")))?; + + let (tile_width, tile_height) = tile_size(&dataset); + assert!(tile_width.is_some()); + assert_eq!(tile_width.unwrap(), 256); + assert_eq!(tile_height.unwrap(), 512); + Ok(()) + } + + #[test] + fn test_to_banddatatype() { + assert_eq!( + to_banddatatype(GdalDataType::UInt8).unwrap(), + BandDataType::UInt8 + ); + let result = to_banddatatype(GdalDataType::Unknown); + assert!(result.is_err()); + assert!(result + .err() + .unwrap() + .to_string() + .contains("Unsupported GDAL data type")); + } +} diff --git a/rust/sedona-gdal/src/lib.rs b/rust/sedona-gdal/src/lib.rs new file mode 100644 index 000000000..7e25c58d9 --- /dev/null +++ b/rust/sedona-gdal/src/lib.rs @@ -0,0 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod dataset; +pub mod raster_io; diff --git a/rust/sedona-gdal/src/raster_io.rs b/rust/sedona-gdal/src/raster_io.rs new file mode 100644 index 000000000..4bf8d7294 --- /dev/null +++ b/rust/sedona-gdal/src/raster_io.rs @@ -0,0 +1,684 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::dataset::{geotransform_components, tile_size, to_banddatatype}; +use arrow::array::StructArray; +use arrow::error::ArrowError; +use gdal::raster::RasterBand; +use gdal::Dataset; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata}; +use sedona_raster::utils::{bytes_per_pixel, f64_to_bandtype_bytes}; +use sedona_schema::raster::{BandDataType, StorageType}; +use std::sync::Arc; + +// ============================================================================ +// Format-specific convenience wrappers +// ============================================================================ + +/// Reads a GeoTIFF file using GDAL and converts it into a StructArray of rasters. +/// +/// This is a convenience wrapper around [`read_raster`] for GeoTIFF files. +/// +/// # Arguments +/// * `filepath` - Path to the GeoTIFF file +/// * `tile_size_opt` - Optional tile size to override dataset metadata +pub fn read_geotiff( + filepath: &str, + tile_size_opt: Option<(usize, usize)>, +) -> Result, ArrowError> { + // Check that the filepath has a GeoTIFF extension + let filepath_lower = filepath.to_lowercase(); + if !filepath_lower.ends_with(".tif") && !filepath_lower.ends_with(".tiff") { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected GeoTIFF file with .tif or .tiff extension, got: {}", + filepath + ))); + } + read_raster(filepath, tile_size_opt) +} + +/// Writes a tiled raster StructArray to a GeoTIFF file using GDAL. +/// +/// This is a convenience wrapper around [`write_raster`] for GeoTIFF files. +/// +/// # Arguments +/// * `raster_array` - The raster struct array to write +/// * `filepath` - Path to the output GeoTIFF file +pub fn write_geotiff(raster_array: &StructArray, filepath: &str) -> Result<(), ArrowError> { + // Check that the filepath has a GeoTIFF extension + let filepath_lower = filepath.to_lowercase(); + if !filepath_lower.ends_with(".tif") && !filepath_lower.ends_with(".tiff") { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected GeoTIFF file with .tif or .tiff extension, got: {}", + filepath + ))); + } + write_raster(raster_array, filepath, "GTiff") +} + +/// Reads a raster file using GDAL and converts it into a StructArray of rasters. +/// +/// Currently only supports reading rasters into InDb storage type. +/// OutDb storage types are not yet implemented. +/// +/// # Arguments +/// * `filepath` - Path to the raster file +/// * `tile_size_opt` - Optional tile size to override dataset metadata (uses dataset metadata if None) +pub fn read_raster( + filepath: &str, + tile_size_opt: Option<(usize, usize)>, +) -> Result, ArrowError> { + let dataset = Dataset::open(filepath).map_err(|err| ArrowError::ParseError(err.to_string()))?; + + // Extract geotransform components + let (origin_upperleft_x, origin_upperleft_y, scale_x, scale_y, skew_x, skew_y) = + geotransform_components(&dataset)?; + + let (raster_width, raster_height) = dataset.raster_size(); + + // Use provided tile dimensions or fall back to dataset metadata + // If neither is provided, default to full raster size + // In the future consider using an ideal tile size + let (tile_width, tile_height) = match tile_size_opt { + Some((w, h)) => (w, h), + _ => match tile_size(&dataset) { + (Some(w), Some(h)) => (w, h), + _ => (raster_width, raster_height), + }, + }; + + let x_tile_count = raster_width.div_ceil(tile_width); + let y_tile_count = raster_height.div_ceil(tile_height); + + let mut raster_builder = RasterBuilder::new(x_tile_count * y_tile_count); + let band_count = dataset.raster_count(); + + for tile_y in 0..y_tile_count { + for tile_x in 0..x_tile_count { + let x_offset = tile_x * tile_width; + let y_offset = tile_y * tile_height; + + // Calculate geographic coordinates for this tile + // using the geotransform from the original raster + let tile_upperleft_x = + origin_upperleft_x + (x_offset as f64) * scale_x + (y_offset as f64) * skew_x; + let tile_upperleft_y = + origin_upperleft_y + (x_offset as f64) * skew_y + (y_offset as f64) * scale_y; + + // Create raster metadata for this tile with actual geotransform values + let tile_metadata = RasterMetadata { + width: tile_width as u64, + height: tile_height as u64, + upperleft_x: tile_upperleft_x, + upperleft_y: tile_upperleft_y, + scale_x, + scale_y, + skew_x, + skew_y, + }; + + raster_builder.start_raster(&tile_metadata, None)?; + + for band_number in 1..=band_count { + let band: RasterBand = dataset.rasterband(band_number).unwrap(); + + // The band size is always the full raster size, not the tile size + // We read a specific window from the band + let actual_tile_width = tile_width.min(raster_width - x_offset); + let actual_tile_height = tile_height.min(raster_height - y_offset); + + let data_type = to_banddatatype(band.band_type())?; + let nodata_value = band + .no_data_value() + .map(|val| f64_to_bandtype_bytes(val, data_type)); + + let band_metadata = BandMetadata { + nodata_value, + storage_type: StorageType::InDb, + datatype: data_type, + outdb_url: None, + outdb_band_id: None, + }; + + raster_builder.start_band(band_metadata)?; + + // Reserve capacity and write band data directly from GDAL + let pixel_count = actual_tile_width * actual_tile_height; + let bytes_needed = pixel_count * bytes_per_pixel(data_type); + + // Pre-allocate a vector of the exact size needed + let mut band_data = vec![0u8; bytes_needed]; + + // Read directly into the pre-allocated buffer + read_band_data_into( + &band, + (x_offset as isize, y_offset as isize), + (actual_tile_width, actual_tile_height), + &data_type, + &mut band_data, + )?; + + // Write the band data + raster_builder.band_data_writer().append_value(&band_data); + + // Finalize the band + raster_builder.finish_band()?; + } + + // Finalize the raster + raster_builder.finish_raster()?; + } + } + + // Finalize the raster struct array + let raster_struct = raster_builder.finish()?; + Ok(Arc::new(raster_struct)) +} + +/// Helper function to read band data from GDAL directly into a pre-allocated byte buffer +/// Casts the buffer to the appropriate type and reads directly onto `output` +fn read_band_data_into( + band: &RasterBand, + window_origin: (isize, isize), + window_size: (usize, usize), + data_type: &BandDataType, + output: &mut [u8], +) -> Result<(), ArrowError> { + let pixel_count = window_size.0 * window_size.1; + + match data_type { + BandDataType::UInt8 => { + band.read_into_slice(window_origin, window_size, window_size, output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::UInt16 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut u16, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Int16 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut i16, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::UInt32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut u32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Int32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut i32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Float32 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut f32, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + BandDataType::Float64 => { + let typed_output = unsafe { + std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut f64, pixel_count) + }; + band.read_into_slice(window_origin, window_size, window_size, typed_output, None) + .map_err(|e| ArrowError::ParseError(format!("Failed to read band data: {e}")))?; + Ok(()) + } + } +} + +/// Write a tiled raster StructArray to a raster file using GDAL. +/// +/// This is a generic function that works with any GDAL-supported raster format. +/// Currently only supports writing rasters with InDb storage type. +/// OutDb storage types will return a NotYetImplemented error. +/// +/// # Arguments +/// * `raster_array` - The raster struct array to write +/// * `filepath` - Path to the output file +/// * `driver_name` - GDAL driver name (e.g., "GTiff", "Zarr") +fn write_raster( + raster_array: &StructArray, + filepath: &str, + driver_name: &str, +) -> Result<(), ArrowError> { + use gdal::{DriverManager, Metadata}; + use sedona_raster::array::RasterStructArray; + use sedona_raster::traits::RasterRef; + + let raster_struct_array = RasterStructArray::new(raster_array); + + if raster_struct_array.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Cannot write empty raster array".to_string(), + )); + } + + // Get the first raster to determine dimensions and band count + let first_raster = raster_struct_array.get(0)?; + let first_metadata = first_raster.metadata(); + let band_count = first_raster.bands().len(); + + if band_count == 0 { + return Err(ArrowError::InvalidArgumentError( + "Raster has no bands".to_string(), + )); + } + + let first_band = first_raster.bands().band(1)?; + let data_type = first_band.metadata().data_type(); + + // Calculate total raster dimensions by analyzing tile positions + let mut max_x: f64 = 0.0; + let mut max_y: f64 = 0.0; + let mut min_x: f64 = f64::MAX; + let mut min_y: f64 = f64::MAX; + + for i in 0..raster_struct_array.len() { + let raster = raster_struct_array.get(i)?; + let metadata = raster.metadata(); + + let tile_max_x = metadata.upper_left_x() + (metadata.width() as f64) * metadata.scale_x(); + let tile_max_y = metadata.upper_left_y() + (metadata.height() as f64) * metadata.scale_y(); + + max_x = max_x.max(tile_max_x); + max_y = max_y.max(tile_max_y); + min_x = min_x.min(metadata.upper_left_x()); + min_y = min_y.min(metadata.upper_left_y()); + } + + // Calculate total raster dimensions + let scale_x = first_metadata.scale_x(); + let scale_y = first_metadata.scale_y(); + let total_width = ((max_x - min_x) / scale_x).abs().round() as usize; + let total_height = ((max_y - min_y) / scale_y).abs().round() as usize; + + // Get GDAL driver by name + let driver = DriverManager::get_driver_by_name(driver_name).map_err(|e| { + ArrowError::ParseError(format!("Failed to get {} driver: {e}", driver_name)) + })?; + + // Create dataset based on data type + let mut dataset = match data_type { + BandDataType::UInt8 => { + driver.create_with_band_type::(filepath, total_width, total_height, band_count) + } + BandDataType::UInt16 => { + driver.create_with_band_type::(filepath, total_width, total_height, band_count) + } + BandDataType::Int16 => { + driver.create_with_band_type::(filepath, total_width, total_height, band_count) + } + BandDataType::UInt32 => { + driver.create_with_band_type::(filepath, total_width, total_height, band_count) + } + BandDataType::Int32 => { + driver.create_with_band_type::(filepath, total_width, total_height, band_count) + } + BandDataType::Float32 => { + driver.create_with_band_type::(filepath, total_width, total_height, band_count) + } + BandDataType::Float64 => { + driver.create_with_band_type::(filepath, total_width, total_height, band_count) + } + } + .map_err(|e| ArrowError::ParseError(format!("Failed to create GeoTIFF: {e}")))?; + + // Set geotransform + dataset + .set_geo_transform(&[ + min_x, + scale_x, + first_metadata.skew_x(), + min_y, + first_metadata.skew_y(), + scale_y, + ]) + .map_err(|e| ArrowError::ParseError(format!("Failed to set geotransform: {e}")))?; + + // Set tile size metadata if tiles are being used + let tile_width = first_metadata.width(); + let tile_height = first_metadata.height(); + dataset + .set_metadata_item("TILEWIDTH", &tile_width.to_string(), "") + .map_err(|e| ArrowError::ParseError(format!("Failed to set TILEWIDTH metadata: {e}")))?; + dataset + .set_metadata_item("TILEHEIGHT", &tile_height.to_string(), "") + .map_err(|e| ArrowError::ParseError(format!("Failed to set TILEHEIGHT metadata: {e}")))?; + + // Write each tile to the dataset + for i in 0..raster_struct_array.len() { + let raster = raster_struct_array.get(i)?; + let raster_metadata = raster.metadata(); + + // Calculate pixel offset for this tile + let x_offset = ((raster_metadata.upper_left_x() - min_x) / scale_x).round() as isize; + let y_offset = ((raster_metadata.upper_left_y() - min_y) / scale_y).round() as isize; + + let tile_width = raster_metadata.width() as usize; + let tile_height = raster_metadata.height() as usize; + + // Write each band + for band_index in 0..band_count { + let band = raster.bands().band(band_index + 1)?; + + // Check that storage type is InDb + if band.metadata().storage_type() != StorageType::InDb { + return Err(ArrowError::NotYetImplemented( + format!("Writing rasters with storage type {:?} is not yet implemented. Only InDb storage is currently supported.", + band.metadata().storage_type()) + )); + } + + let mut gdal_band = dataset.rasterband(band_index + 1).map_err(|e| { + ArrowError::ParseError(format!("Failed to get GDAL band {}: {e}", band_index + 1)) + })?; + + let band_data = band.data(); + let band_datatype = band.metadata().data_type(); + + // Write the band data to the appropriate location in the dataset + // Convert the byte slice to the appropriate type for GDAL + write_band_data( + &mut gdal_band, + (x_offset, y_offset), + (tile_width, tile_height), + band_data, + band_datatype, + )?; + + // Set nodata value if present + // Note: Some drivers (e.g., Zarr) don't support nodata values + if let Some(nodata_bytes) = band.metadata().nodata_value() { + if let Some(nodata_f64) = bytes_to_f64(nodata_bytes, band.metadata().data_type()) { + let _ = gdal_band.set_no_data_value(Some(nodata_f64)); + } + } + } + } + + // Flush the dataset + dataset.flush_cache().map_err(|e| { + ArrowError::ParseError(format!("Failed to flush cache when writing GeoTIFF: {e}")) + })?; + + Ok(()) +} + +/// Helper function to write band data to GDAL, converting from bytes to the appropriate type +fn write_band_data( + gdal_band: &mut RasterBand, + window_origin: (isize, isize), + window_size: (usize, usize), + band_data: &[u8], + data_type: BandDataType, +) -> Result<(), ArrowError> { + use gdal::raster::Buffer; + + let (width, height) = window_size; + let pixel_count = width * height; + + match data_type { + BandDataType::UInt8 => { + let data: Vec = band_data.to_vec(); + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::UInt16 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(2) { + data.push(u16::from_ne_bytes([chunk[0], chunk[1]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::Int16 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(2) { + data.push(i16::from_ne_bytes([chunk[0], chunk[1]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::UInt32 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(4) { + data.push(u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::Int32 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(4) { + data.push(i32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::Float32 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(4) { + data.push(f32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + BandDataType::Float64 => { + let mut data = Vec::with_capacity(pixel_count); + for chunk in band_data.chunks_exact(8) { + data.push(f64::from_ne_bytes([ + chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6], chunk[7], + ])); + } + let mut buffer = Buffer::new(window_size, data); + gdal_band.write(window_origin, window_size, &mut buffer) + } + } + .map_err(|e| ArrowError::ParseError(format!("Failed to write band data: {e}"))) +} + +/// Convert nodata value bytes back to f64 +fn bytes_to_f64(bytes: &[u8], data_type: BandDataType) -> Option { + let expected_bytes = bytes_per_pixel(data_type); + if bytes.len() < expected_bytes { + return None; + } + match data_type { + BandDataType::UInt8 => Some(bytes[0] as f64), + BandDataType::UInt16 => Some(u16::from_ne_bytes([bytes[0], bytes[1]]) as f64), + BandDataType::Int16 => Some(i16::from_ne_bytes([bytes[0], bytes[1]]) as f64), + BandDataType::UInt32 => { + Some(u32::from_ne_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as f64) + } + BandDataType::Int32 => { + Some(i32::from_ne_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as f64) + } + BandDataType::Float32 => { + Some(f32::from_ne_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as f64) + } + BandDataType::Float64 => Some(f64::from_ne_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ])), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_abs_diff_eq; + use arrow::array::Array; + use rstest::rstest; + use sedona_raster::array::RasterStructArray; + use sedona_raster::traits::RasterRef; + use sedona_schema::raster::BandDataType; + use sedona_testing::rasters::{assert_raster_arrays_equal, generate_tiled_rasters}; + use tempfile::tempdir; + + #[rstest] + fn test_read_write_raster( + #[values( + BandDataType::UInt8, + BandDataType::UInt16, + BandDataType::Int16, + BandDataType::UInt32, + BandDataType::Int32, + BandDataType::Float32, + BandDataType::Float64 + )] + data_type: BandDataType, + ) { + let tile_count = (4, 4); + let tile_size = (16, 8); + let raster_struct = + generate_tiled_rasters(tile_size, tile_count, data_type, Some(42)).unwrap(); + + // Write the raster array to a temporary GeoTIFF file + let temp_dir = tempdir().unwrap(); + let filepath = temp_dir + .path() + .join(format!("test_raster_output_{:?}.tif", data_type)); + let filepath_str = filepath.as_os_str().to_str().unwrap(); + write_geotiff(&raster_struct, filepath_str).unwrap(); + + // Read the rasters back in from the GeoTIFF using the tile metadata + let read_raster_struct = read_geotiff(filepath_str, None).unwrap(); + assert_eq!(raster_struct.len(), read_raster_struct.len()); + + // Compare the original and read rasters for equality + let raster_array = RasterStructArray::new(&raster_struct); + let read_raster_array = RasterStructArray::new(&read_raster_struct); + assert_raster_arrays_equal(&raster_array, &read_raster_array); + + // Re-Read with new tiling parameters + let (new_tile_width, new_tile_height) = (4, 2); + let read_raster_array_tiled = + read_raster(filepath_str, Some((new_tile_width, new_tile_height))).unwrap(); + let raster_retiled_array = RasterStructArray::new(&read_raster_array_tiled); + + let expected_new_length = (tile_count.0 * tile_size.0 / new_tile_width) + * (tile_count.1 * tile_size.1 / new_tile_height); + // Validate the new tiling + assert_eq!(expected_new_length, raster_retiled_array.len()); + let raster = raster_retiled_array.get(0).unwrap(); + let metadata = raster.metadata(); + assert_eq!(metadata.width(), new_tile_width as u64); + assert_eq!(metadata.height(), new_tile_height as u64); + + // Re-Write the re-tiled raster to a new GeoTIFF + let retiled_filepath = temp_dir + .path() + .join(format!("test_raster_retiled_output_{:?}.tif", data_type)); + let retiled_filepath_str = retiled_filepath.as_os_str().to_str().unwrap(); + write_geotiff(&read_raster_array_tiled, retiled_filepath_str).unwrap(); + + // Re-Read with original tiling parameters + let read_original_tiling_raster_array = + read_geotiff(retiled_filepath_str, Some(tile_size)).unwrap(); + + // Validate that we get back the original raster array + assert_raster_arrays_equal( + &raster_array, + &RasterStructArray::new(&read_original_tiling_raster_array), + ); + + // Clean up + drop(filepath); + drop(retiled_filepath); + temp_dir.close().unwrap(); + } + + #[test] + fn test_filepath_validation() { + let err = read_geotiff("test.zarr", None).unwrap_err(); + assert!(err.to_string().contains("Expected GeoTIFF")); + assert!(err.to_string().contains(".tif or .tiff")); + + let raster_struct = + generate_tiled_rasters((16, 16), (8, 8), BandDataType::UInt8, Some(42)).unwrap(); + let err = write_geotiff(&raster_struct, "test.zarr").unwrap_err(); + assert!(err.to_string().contains("Expected GeoTIFF")); + } + + #[test] + fn test_bytes_to_f64() { + // UInt8 + let value: u8 = 255; + let bytes = value.to_le_bytes(); + assert_eq!(bytes_to_f64(&bytes, BandDataType::UInt8), Some(255.0)); + + // UInt16 + let value: u16 = 32768; + let bytes = value.to_le_bytes(); + assert_eq!(bytes_to_f64(&bytes, BandDataType::UInt16), Some(32768.0)); + + // Int16 + let value: i16 = -32768; + let bytes = value.to_le_bytes(); + assert_eq!(bytes_to_f64(&bytes, BandDataType::Int16), Some(-32768.0)); + + // UInt32 + let value: u32 = 2147483648; + let bytes = value.to_le_bytes(); + assert_eq!( + bytes_to_f64(&bytes, BandDataType::UInt32), + Some(2147483648.0) + ); + + // Int32 + let value: i32 = -2147483648; + let bytes = value.to_le_bytes(); + assert_eq!( + bytes_to_f64(&bytes, BandDataType::Int32), + Some(-2147483648.0) + ); + + // Float32 + let value: f32 = 256.7; + let bytes = value.to_le_bytes(); + assert_abs_diff_eq!( + bytes_to_f64(&bytes, BandDataType::Float32).unwrap(), + 256.7, + epsilon = 0.0001 // f32 precision limit + ); + + // Float64 + let value: f64 = -2147483648.3; + let bytes = value.to_le_bytes(); + assert_eq!( + bytes_to_f64(&bytes, BandDataType::Float64), + Some(-2147483648.3) + ); + } +} diff --git a/rust/sedona-raster/src/builder.rs b/rust/sedona-raster/src/builder.rs index 891a6ff07..5b75b2962 100644 --- a/rust/sedona-raster/src/builder.rs +++ b/rust/sedona-raster/src/builder.rs @@ -647,7 +647,7 @@ mod tests { let band_metadata = BandMetadata { nodata_value: None, storage_type: StorageType::InDb, - datatype: expected_data_type.clone(), + datatype: expected_data_type, outdb_url: None, outdb_band_id: None, }; diff --git a/rust/sedona-raster/src/lib.rs b/rust/sedona-raster/src/lib.rs index de2e9b2e4..94aa15d50 100644 --- a/rust/sedona-raster/src/lib.rs +++ b/rust/sedona-raster/src/lib.rs @@ -19,3 +19,4 @@ pub mod affine_transformation; pub mod array; pub mod builder; pub mod traits; +pub mod utils; diff --git a/rust/sedona-raster/src/utils.rs b/rust/sedona-raster/src/utils.rs new file mode 100644 index 000000000..7f3707cb0 --- /dev/null +++ b/rust/sedona-raster/src/utils.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use sedona_schema::raster::BandDataType; + +/// Get the number of bytes per pixel for a given BandDataType +pub fn bytes_per_pixel(datatype: BandDataType) -> usize { + match datatype { + BandDataType::UInt8 => 1, + BandDataType::Int16 | BandDataType::UInt16 => 2, + BandDataType::Int32 | BandDataType::UInt32 | BandDataType::Float32 => 4, + BandDataType::Float64 => 8, + } +} + +/// Convert an f64 nodata value to bytes for the given BandDataType +pub fn f64_to_bandtype_bytes(value: f64, datatype: BandDataType) -> Vec { + match datatype { + BandDataType::UInt8 => vec![value as u8], + BandDataType::UInt16 => (value as u16).to_le_bytes().to_vec(), + BandDataType::Int16 => (value as i16).to_le_bytes().to_vec(), + BandDataType::UInt32 => (value as u32).to_le_bytes().to_vec(), + BandDataType::Int32 => (value as i32).to_le_bytes().to_vec(), + BandDataType::Float32 => (value as f32).to_le_bytes().to_vec(), + BandDataType::Float64 => value.to_le_bytes().to_vec(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_bytes_per_pixel() { + assert_eq!(bytes_per_pixel(BandDataType::UInt8), 1); + assert_eq!(bytes_per_pixel(BandDataType::Int16), 2); + assert_eq!(bytes_per_pixel(BandDataType::UInt16), 2); + assert_eq!(bytes_per_pixel(BandDataType::Int32), 4); + assert_eq!(bytes_per_pixel(BandDataType::UInt32), 4); + assert_eq!(bytes_per_pixel(BandDataType::Float32), 4); + assert_eq!(bytes_per_pixel(BandDataType::Float64), 8); + } + + #[test] + fn test_f64_to_bandtype_bytes() { + assert_eq!( + f64_to_bandtype_bytes(255.0, BandDataType::UInt8), + vec![255u8] + ); + assert_eq!( + f64_to_bandtype_bytes(65535.0, BandDataType::UInt16), + 65535u16.to_le_bytes().to_vec() + ); + assert_eq!( + f64_to_bandtype_bytes(-32768.0, BandDataType::Int16), + (-32768i16).to_le_bytes().to_vec() + ); + assert_eq!( + f64_to_bandtype_bytes(4294967295.0, BandDataType::UInt32), + 4294967295u32.to_le_bytes().to_vec() + ); + assert_eq!( + f64_to_bandtype_bytes(-2147483648.0, BandDataType::Int32), + (-2147483648i32).to_le_bytes().to_vec() + ); + assert_eq!( + f64_to_bandtype_bytes(std::f32::consts::PI.into(), BandDataType::Float32), + (std::f32::consts::PI).to_le_bytes().to_vec() + ); + assert_eq!( + f64_to_bandtype_bytes(std::f64::consts::E, BandDataType::Float64), + (std::f64::consts::E).to_le_bytes().to_vec() + ); + } +} diff --git a/rust/sedona-schema/src/raster.rs b/rust/sedona-schema/src/raster.rs index 1027e53fd..003056c16 100644 --- a/rust/sedona-schema/src/raster.rs +++ b/rust/sedona-schema/src/raster.rs @@ -92,7 +92,7 @@ impl RasterSchema { /// In future versions, consider support for complex types used in /// radar and other wave-based data. #[repr(u16)] -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum BandDataType { UInt8 = 0, UInt16 = 1, diff --git a/rust/sedona-testing/src/rasters.rs b/rust/sedona-testing/src/rasters.rs index 69d83ca15..8d13881c5 100644 --- a/rust/sedona-testing/src/rasters.rs +++ b/rust/sedona-testing/src/rasters.rs @@ -115,7 +115,7 @@ pub fn generate_tiled_rasters( let band_metadata = BandMetadata { nodata_value: nodata_value.clone(), storage_type: StorageType::InDb, - datatype: data_type.clone(), + datatype: data_type, outdb_url: None, outdb_band_id: None, };