Skip to content

Commit 764d547

Browse files
authored
speedup date_trunc (~7x faster) in some cases (#16859)
1 parent c3ec964 commit 764d547

File tree

2 files changed

+89
-1
lines changed

2 files changed

+89
-1
lines changed

datafusion/functions/src/datetime/date_trunc.rs

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use arrow::array::types::{
2828
ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType,
2929
TimestampNanosecondType, TimestampSecondType,
3030
};
31-
use arrow::array::{Array, PrimitiveArray};
31+
use arrow::array::{Array, ArrayRef, Int64Array, PrimitiveArray};
3232
use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View};
3333
use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second};
3434
use datafusion_common::cast::as_primitive_array;
@@ -60,6 +60,8 @@ use chrono::{
6060
- hour / HOUR
6161
- minute / MINUTE
6262
- second / SECOND
63+
- millisecond / MILLISECOND
64+
- microsecond / MICROSECOND
6365
"#
6466
),
6567
argument(
@@ -185,6 +187,26 @@ impl ScalarUDFImpl for DateTruncFunc {
185187
) -> Result<ColumnarValue> {
186188
let parsed_tz = parse_tz(tz_opt)?;
187189
let array = as_primitive_array::<T>(array)?;
190+
191+
// fast path for fine granularities
192+
if matches!(
193+
granularity.as_str(),
194+
// For modern timezones, it's correct to truncate "minute" in this way.
195+
// Both datafusion and arrow are ignoring historical timezone's non-minute granularity
196+
// bias (e.g., Asia/Kathmandu before 1919 is UTC+05:41:16).
197+
"second" | "minute" | "millisecond" | "microsecond"
198+
) ||
199+
// In UTC, "hour" and "day" have uniform durations and can be truncated with simple arithmetic
200+
(parsed_tz.is_none() && matches!(granularity.as_str(), "hour" | "day"))
201+
{
202+
let result = general_date_trunc_array_fine_granularity(
203+
T::UNIT,
204+
array,
205+
granularity.as_str(),
206+
)?;
207+
return Ok(ColumnarValue::Array(result));
208+
}
209+
188210
let array: PrimitiveArray<T> = array
189211
.try_unary(|x| {
190212
general_date_trunc(T::UNIT, x, parsed_tz, granularity.as_str())
@@ -423,6 +445,55 @@ fn date_trunc_coarse(granularity: &str, value: i64, tz: Option<Tz>) -> Result<i6
423445
Ok(value.unwrap())
424446
}
425447

448+
/// Fast path for fine granularities (hour and smaller) that can be handled
449+
/// with simple arithmetic operations without calendar complexity.
450+
///
451+
/// This function is timezone-agnostic and should only be used when:
452+
/// - No timezone is specified in the input, OR
453+
/// - The granularity is less than hour as hour can be affected by DST transitions in some cases
454+
fn general_date_trunc_array_fine_granularity<T: ArrowTimestampType>(
455+
tu: TimeUnit,
456+
array: &PrimitiveArray<T>,
457+
granularity: &str,
458+
) -> Result<ArrayRef> {
459+
let unit = match (tu, granularity) {
460+
(Second, "minute") => Some(Int64Array::new_scalar(60)),
461+
(Second, "hour") => Some(Int64Array::new_scalar(3600)),
462+
(Second, "day") => Some(Int64Array::new_scalar(86400)),
463+
464+
(Millisecond, "second") => Some(Int64Array::new_scalar(1_000)),
465+
(Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)),
466+
(Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)),
467+
(Millisecond, "day") => Some(Int64Array::new_scalar(86_400_000)),
468+
469+
(Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)),
470+
(Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)),
471+
(Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)),
472+
(Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)),
473+
(Microsecond, "day") => Some(Int64Array::new_scalar(86_400_000_000)),
474+
475+
(Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)),
476+
(Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)),
477+
(Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)),
478+
(Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)),
479+
(Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)),
480+
(Nanosecond, "day") => Some(Int64Array::new_scalar(86_400_000_000_000)),
481+
_ => None,
482+
};
483+
484+
if let Some(unit) = unit {
485+
let original_type = array.data_type();
486+
let array = arrow::compute::cast(array, &DataType::Int64)?;
487+
let array = arrow::compute::kernels::numeric::div(&array, &unit)?;
488+
let array = arrow::compute::kernels::numeric::mul(&array, &unit)?;
489+
let array = arrow::compute::cast(&array, original_type)?;
490+
Ok(array)
491+
} else {
492+
// truncate to the same or smaller unit
493+
Ok(Arc::new(array.clone()))
494+
}
495+
}
496+
426497
// truncates a single value with the given timeunit to the specified granularity
427498
fn general_date_trunc(
428499
tu: TimeUnit,
@@ -884,6 +955,21 @@ mod tests {
884955
"2018-11-04T02:00:00-02",
885956
],
886957
),
958+
(
959+
vec![
960+
"2024-10-26T23:30:00Z",
961+
"2024-10-27T00:30:00Z",
962+
"2024-10-27T01:30:00Z",
963+
"2024-10-27T02:30:00Z",
964+
],
965+
Some("Asia/Kathmandu".into()), // UTC+5:45
966+
vec![
967+
"2024-10-27T05:00:00+05:45",
968+
"2024-10-27T06:00:00+05:45",
969+
"2024-10-27T07:00:00+05:45",
970+
"2024-10-27T08:00:00+05:45",
971+
],
972+
),
887973
];
888974

889975
cases.iter().for_each(|(original, tz_opt, expected)| {

docs/source/user-guide/sql/scalar_functions.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2150,6 +2150,8 @@ date_trunc(precision, expression)
21502150
- hour / HOUR
21512151
- minute / MINUTE
21522152
- second / SECOND
2153+
- millisecond / MILLISECOND
2154+
- microsecond / MICROSECOND
21532155

21542156
- **expression**: Time expression to operate on. Can be a constant, column, or function.
21552157

0 commit comments

Comments
 (0)