summaryrefslogtreecommitdiffstats
path: root/library/core/src/slice/sort.rs
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--library/core/src/slice/sort.rs544
1 files changed, 540 insertions, 4 deletions
diff --git a/library/core/src/slice/sort.rs b/library/core/src/slice/sort.rs
index 87f77b7f2..2181f9a81 100644
--- a/library/core/src/slice/sort.rs
+++ b/library/core/src/slice/sort.rs
@@ -3,8 +3,11 @@
//! This module contains a sorting algorithm based on Orson Peters' pattern-defeating quicksort,
//! published at: <https://github.com/orlp/pdqsort>
//!
-//! Unstable sorting is compatible with libcore because it doesn't allocate memory, unlike our
+//! Unstable sorting is compatible with core because it doesn't allocate memory, unlike our
//! stable sorting implementation.
+//!
+//! In addition, it contains the core logic of the stable sort used by `slice::sort`, which is
+//! based on TimSort.
use crate::cmp;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
@@ -18,9 +21,9 @@ struct CopyOnDrop<T> {
impl<T> Drop for CopyOnDrop<T> {
fn drop(&mut self) {
- // SAFETY: This is a helper class.
- // Please refer to its usage for correctness.
- // Namely, one must be sure that `src` and `dst` does not overlap as required by `ptr::copy_nonoverlapping`.
+ // SAFETY: This is a helper class.
+ // Please refer to its usage for correctness.
+ // Namely, one must be sure that `src` and `dest` do not overlap as required by `ptr::copy_nonoverlapping`.
unsafe {
ptr::copy_nonoverlapping(self.src, self.dest, 1);
}
@@ -831,6 +834,15 @@ fn partition_at_index_loop<'a, T, F>(
) where
F: FnMut(&T, &T) -> bool,
{
+ // Limit the amount of iterations and fall back to heapsort, similarly to `slice::sort_unstable`.
+ // This lowers the worst case running time from O(n^2) to O(n log n).
+ // FIXME: Investigate whether it would be better to use something like Median of Medians
+ // or Fast Deterministic Selection to guarantee O(n) worst case.
+ let mut limit = usize::BITS - v.len().leading_zeros();
+
+ // True if the last partitioning was reasonably balanced.
+ let mut was_balanced = true;
+
loop {
// For slices of up to this length it's probably faster to simply sort them.
const MAX_INSERTION: usize = 10;
@@ -839,6 +851,18 @@ fn partition_at_index_loop<'a, T, F>(
return;
}
+ if limit == 0 {
+ heapsort(v, is_less);
+ return;
+ }
+
+ // If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling
+ // some elements around. Hopefully we'll choose a better pivot this time.
+ if !was_balanced {
+ break_patterns(v);
+ limit -= 1;
+ }
+
// Choose a pivot
let (pivot, _) = choose_pivot(v, is_less);
@@ -863,6 +887,7 @@ fn partition_at_index_loop<'a, T, F>(
}
let (mid, _) = partition(v, pivot, is_less);
+ was_balanced = cmp::min(mid, v.len() - mid) >= v.len() / 8;
// Split the slice into `left`, `pivot`, and `right`.
let (left, right) = v.split_at_mut(mid);
@@ -883,6 +908,7 @@ fn partition_at_index_loop<'a, T, F>(
}
}
+/// Reorder the slice such that the element at `index` is at its final sorted position.
pub fn partition_at_index<T, F>(
v: &mut [T],
index: usize,
@@ -927,3 +953,513 @@ where
let pivot = &mut pivot[0];
(left, pivot, right)
}
+
+/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
+///
+/// This is the integral subroutine of insertion sort.
+fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
+where
+ F: FnMut(&T, &T) -> bool,
+{
+ if v.len() >= 2 && is_less(&v[1], &v[0]) {
+ // SAFETY: `tmp` is copied back into `v` even if `is_less` panics, so `v` still holds
+ // every element it initially held exactly once.
+ unsafe {
+ // There are three ways to implement insertion here:
+ //
+ // 1. Swap adjacent elements until the first one gets to its final destination.
+ // However, this way we copy data around more than is necessary. If elements are big
+ // structures (costly to copy), this method will be slow.
+ //
+ // 2. Iterate until the right place for the first element is found. Then shift the
+ // elements succeeding it to make room for it and finally place it into the
+ // remaining hole. This is a good method.
+ //
+ // 3. Copy the first element into a temporary variable. Iterate until the right place
+ // for it is found. As we go along, copy every traversed element into the slot
+ // preceding it. Finally, copy data from the temporary variable into the remaining
+ // hole. This method is very good. Benchmarks demonstrated slightly better
+ // performance than with the 2nd method.
+ //
+ // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
+ let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
+
+ // Intermediate state of the insertion process is always tracked by `hole`, which
+ // serves two purposes:
+ // 1. Protects integrity of `v` from panics in `is_less`.
+ // 2. Fills the remaining hole in `v` in the end.
+ //
+ // Panic safety:
+ //
+ // If `is_less` panics at any point during the process, `hole` will get dropped and
+ // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
+ // initially held exactly once.
+ let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] };
+ ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
+
+ for i in 2..v.len() {
+ if !is_less(&v[i], &*tmp) {
+ break;
+ }
+ ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
+ hole.dest = &mut v[i];
+ }
+ // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
+ }
+ }
+
+ // When dropped, copies from `src` into `dest`.
+ struct InsertionHole<T> {
+ src: *const T,
+ dest: *mut T,
+ }
+
+ impl<T> Drop for InsertionHole<T> {
+ fn drop(&mut self) {
+ // SAFETY: The caller must ensure that src and dest are correctly set.
+ unsafe {
+ ptr::copy_nonoverlapping(self.src, self.dest, 1);
+ }
+ }
+ }
+}
+
+/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
+/// stores the result into `v[..]`.
+///
+/// # Safety
+///
+/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
+/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
+unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
+where
+ F: FnMut(&T, &T) -> bool,
+{
+ let len = v.len();
+ let v = v.as_mut_ptr();
+
+ // SAFETY: mid and len must be in-bounds of v.
+ let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };
+
+ // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
+ // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
+ // copying the lesser (or greater) one into `v`.
+ //
+ // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
+ // consumed first, then we must copy whatever is left of the shorter run into the remaining
+ // hole in `v`.
+ //
+ // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
+ // 1. Protects integrity of `v` from panics in `is_less`.
+ // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
+ //
+ // Panic safety:
+ //
+ // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
+ // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
+ // object it initially held exactly once.
+ let mut hole;
+
+ if mid <= len - mid {
+ // The left run is shorter.
+
+ // SAFETY: buf must have enough capacity for `v[..mid]`.
+ unsafe {
+ ptr::copy_nonoverlapping(v, buf, mid);
+ hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
+ }
+
+ // Initially, these pointers point to the beginnings of their arrays.
+ let left = &mut hole.start;
+ let mut right = v_mid;
+ let out = &mut hole.dest;
+
+ while *left < hole.end && right < v_end {
+ // Consume the lesser side.
+ // If equal, prefer the left run to maintain stability.
+
+ // SAFETY: left and right must be valid and part of v; the same goes for out.
+ unsafe {
+ let to_copy = if is_less(&*right, &**left) {
+ get_and_increment(&mut right)
+ } else {
+ get_and_increment(left)
+ };
+ ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
+ }
+ }
+ } else {
+ // The right run is shorter.
+
+ // SAFETY: buf must have enough capacity for `v[mid..]`.
+ unsafe {
+ ptr::copy_nonoverlapping(v_mid, buf, len - mid);
+ hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
+ }
+
+ // Initially, these pointers point past the ends of their arrays.
+ let left = &mut hole.dest;
+ let right = &mut hole.end;
+ let mut out = v_end;
+
+ while v < *left && buf < *right {
+ // Consume the greater side.
+ // If equal, prefer the right run to maintain stability.
+
+ // SAFETY: left and right must be valid and part of v; the same goes for out.
+ unsafe {
+ let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) {
+ decrement_and_get(left)
+ } else {
+ decrement_and_get(right)
+ };
+ ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
+ }
+ }
+ }
+ // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
+ // it will now be copied into the hole in `v`.
+
+ unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
+ let old = *ptr;
+
+ // SAFETY: ptr.add(1) must still be a valid pointer and part of `v`.
+ *ptr = unsafe { ptr.add(1) };
+ old
+ }
+
+ unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
+ // SAFETY: ptr.sub(1) must still be a valid pointer and part of `v`.
+ *ptr = unsafe { ptr.sub(1) };
+ *ptr
+ }
+
+ // When dropped, copies the range `start..end` into `dest..`.
+ struct MergeHole<T> {
+ start: *mut T,
+ end: *mut T,
+ dest: *mut T,
+ }
+
+ impl<T> Drop for MergeHole<T> {
+ fn drop(&mut self) {
+ // SAFETY: `T` is not a zero-sized type, and these are pointers into a slice's elements.
+ unsafe {
+ let len = self.end.sub_ptr(self.start);
+ ptr::copy_nonoverlapping(self.start, self.dest, len);
+ }
+ }
+ }
+}
+
+/// This merge sort borrows some (but not all) ideas from TimSort, which used to be described in
+/// detail [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). However Python
+/// has switched to a Powersort based implementation.
+///
+/// The algorithm identifies strictly descending and non-descending subsequences, which are called
+/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
+/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
+/// satisfied:
+///
+/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
+/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
+///
+/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
+pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
+ v: &mut [T],
+ is_less: &mut CmpF,
+ elem_alloc_fn: ElemAllocF,
+ elem_dealloc_fn: ElemDeallocF,
+ run_alloc_fn: RunAllocF,
+ run_dealloc_fn: RunDeallocF,
+) where
+ CmpF: FnMut(&T, &T) -> bool,
+ ElemAllocF: Fn(usize) -> *mut T,
+ ElemDeallocF: Fn(*mut T, usize),
+ RunAllocF: Fn(usize) -> *mut TimSortRun,
+ RunDeallocF: Fn(*mut TimSortRun, usize),
+{
+ // Slices of up to this length get sorted using insertion sort.
+ const MAX_INSERTION: usize = 20;
+ // Very short runs are extended using insertion sort to span at least this many elements.
+ const MIN_RUN: usize = 10;
+
+ // The caller should have already checked that.
+ debug_assert!(!T::IS_ZST);
+
+ let len = v.len();
+
+ // Short arrays get sorted in-place via insertion sort to avoid allocations.
+ if len <= MAX_INSERTION {
+ if len >= 2 {
+ for i in (0..len - 1).rev() {
+ insert_head(&mut v[i..], is_less);
+ }
+ }
+ return;
+ }
+
+ // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
+ // shallow copies of the contents of `v` without risking the dtors running on copies if
+ // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
+ // which will always have length at most `len / 2`.
+ let buf = BufGuard::new(len / 2, elem_alloc_fn, elem_dealloc_fn);
+ let buf_ptr = buf.buf_ptr;
+
+ let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);
+
+ // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
+ // strange decision, but consider the fact that merges more often go in the opposite direction
+ // (forwards). According to benchmarks, merging forwards is slightly faster than merging
+ // backwards. To conclude, identifying runs by traversing backwards improves performance.
+ let mut end = len;
+ while end > 0 {
+ // Find the next natural run, and reverse it if it's strictly descending.
+ let mut start = end - 1;
+ if start > 0 {
+ start -= 1;
+
+ // SAFETY: The v.get_unchecked must be fed with correct inbound indices.
+ unsafe {
+ if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
+ while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
+ start -= 1;
+ }
+ v[start..end].reverse();
+ } else {
+ while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
+ {
+ start -= 1;
+ }
+ }
+ }
+ }
+
+ // Insert some more elements into the run if it's too short. Insertion sort is faster than
+ // merge sort on short sequences, so this significantly improves performance.
+ while start > 0 && end - start < MIN_RUN {
+ start -= 1;
+ insert_head(&mut v[start..end], is_less);
+ }
+
+ // Push this run onto the stack.
+ runs.push(TimSortRun { start, len: end - start });
+ end = start;
+
+ // Merge some pairs of adjacent runs to satisfy the invariants.
+ while let Some(r) = collapse(runs.as_slice()) {
+ let left = runs[r + 1];
+ let right = runs[r];
+ // SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and
+ // neither side may be of length 0.
+ unsafe {
+ merge(&mut v[left.start..right.start + right.len], left.len, buf_ptr, is_less);
+ }
+ runs[r] = TimSortRun { start: left.start, len: left.len + right.len };
+ runs.remove(r + 1);
+ }
+ }
+
+ // Finally, exactly one run must remain in the stack.
+ debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
+
+ // Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
+ // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
+ // algorithm should continue building a new run instead, `None` is returned.
+ //
+ // TimSort is infamous for its buggy implementations, as described here:
+ // http://envisage-project.eu/timsort-specification-and-verification/
+ //
+ // The gist of the story is: we must enforce the invariants on the top four runs on the stack.
+ // Enforcing them on just top three is not sufficient to ensure that the invariants will still
+ // hold for *all* runs in the stack.
+ //
+ // This function correctly checks invariants for the top four runs. Additionally, if the top
+ // run starts at index 0, it will always demand a merge operation until the stack is fully
+ // collapsed, in order to complete the sort.
+ #[inline]
+ fn collapse(runs: &[TimSortRun]) -> Option<usize> {
+ let n = runs.len();
+ if n >= 2
+ && (runs[n - 1].start == 0
+ || runs[n - 2].len <= runs[n - 1].len
+ || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
+ || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
+ {
+ if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
+ } else {
+ None
+ }
+ }
+
+ // Extremely basic versions of Vec.
+ // Their use is super limited and by having the code here, it allows reuse between the sort
+ // implementations.
+ struct BufGuard<T, ElemDeallocF>
+ where
+ ElemDeallocF: Fn(*mut T, usize),
+ {
+ buf_ptr: *mut T,
+ capacity: usize,
+ elem_dealloc_fn: ElemDeallocF,
+ }
+
+ impl<T, ElemDeallocF> BufGuard<T, ElemDeallocF>
+ where
+ ElemDeallocF: Fn(*mut T, usize),
+ {
+ fn new<ElemAllocF>(
+ len: usize,
+ elem_alloc_fn: ElemAllocF,
+ elem_dealloc_fn: ElemDeallocF,
+ ) -> Self
+ where
+ ElemAllocF: Fn(usize) -> *mut T,
+ {
+ Self { buf_ptr: elem_alloc_fn(len), capacity: len, elem_dealloc_fn }
+ }
+ }
+
+ impl<T, ElemDeallocF> Drop for BufGuard<T, ElemDeallocF>
+ where
+ ElemDeallocF: Fn(*mut T, usize),
+ {
+ fn drop(&mut self) {
+ (self.elem_dealloc_fn)(self.buf_ptr, self.capacity);
+ }
+ }
+
+ struct RunVec<RunAllocF, RunDeallocF>
+ where
+ RunAllocF: Fn(usize) -> *mut TimSortRun,
+ RunDeallocF: Fn(*mut TimSortRun, usize),
+ {
+ buf_ptr: *mut TimSortRun,
+ capacity: usize,
+ len: usize,
+ run_alloc_fn: RunAllocF,
+ run_dealloc_fn: RunDeallocF,
+ }
+
+ impl<RunAllocF, RunDeallocF> RunVec<RunAllocF, RunDeallocF>
+ where
+ RunAllocF: Fn(usize) -> *mut TimSortRun,
+ RunDeallocF: Fn(*mut TimSortRun, usize),
+ {
+ fn new(run_alloc_fn: RunAllocF, run_dealloc_fn: RunDeallocF) -> Self {
+ // Most slices can be sorted with at most 16 runs in-flight.
+ const START_RUN_CAPACITY: usize = 16;
+
+ Self {
+ buf_ptr: run_alloc_fn(START_RUN_CAPACITY),
+ capacity: START_RUN_CAPACITY,
+ len: 0,
+ run_alloc_fn,
+ run_dealloc_fn,
+ }
+ }
+
+ fn push(&mut self, val: TimSortRun) {
+ if self.len == self.capacity {
+ let old_capacity = self.capacity;
+ let old_buf_ptr = self.buf_ptr;
+
+ self.capacity = self.capacity * 2;
+ self.buf_ptr = (self.run_alloc_fn)(self.capacity);
+
+ // SAFETY: buf_ptr new and old were correctly allocated and old_buf_ptr has
+ // old_capacity valid elements.
+ unsafe {
+ ptr::copy_nonoverlapping(old_buf_ptr, self.buf_ptr, old_capacity);
+ }
+
+ (self.run_dealloc_fn)(old_buf_ptr, old_capacity);
+ }
+
+ // SAFETY: The invariant was just checked.
+ unsafe {
+ self.buf_ptr.add(self.len).write(val);
+ }
+ self.len += 1;
+ }
+
+ fn remove(&mut self, index: usize) {
+ if index >= self.len {
+ panic!("Index out of bounds");
+ }
+
+ // SAFETY: buf_ptr needs to be valid and len invariant upheld.
+ unsafe {
+ // the place we are taking from.
+ let ptr = self.buf_ptr.add(index);
+
+ // Shift everything down to fill in that spot.
+ ptr::copy(ptr.add(1), ptr, self.len - index - 1);
+ }
+ self.len -= 1;
+ }
+
+ fn as_slice(&self) -> &[TimSortRun] {
+ // SAFETY: Safe as long as buf_ptr is valid and len invariant was upheld.
+ unsafe { &*ptr::slice_from_raw_parts(self.buf_ptr, self.len) }
+ }
+
+ fn len(&self) -> usize {
+ self.len
+ }
+ }
+
+ impl<RunAllocF, RunDeallocF> core::ops::Index<usize> for RunVec<RunAllocF, RunDeallocF>
+ where
+ RunAllocF: Fn(usize) -> *mut TimSortRun,
+ RunDeallocF: Fn(*mut TimSortRun, usize),
+ {
+ type Output = TimSortRun;
+
+ fn index(&self, index: usize) -> &Self::Output {
+ if index < self.len {
+ // SAFETY: buf_ptr and len invariant must be upheld.
+ unsafe {
+ return &*(self.buf_ptr.add(index));
+ }
+ }
+
+ panic!("Index out of bounds");
+ }
+ }
+
+ impl<RunAllocF, RunDeallocF> core::ops::IndexMut<usize> for RunVec<RunAllocF, RunDeallocF>
+ where
+ RunAllocF: Fn(usize) -> *mut TimSortRun,
+ RunDeallocF: Fn(*mut TimSortRun, usize),
+ {
+ fn index_mut(&mut self, index: usize) -> &mut Self::Output {
+ if index < self.len {
+ // SAFETY: buf_ptr and len invariant must be upheld.
+ unsafe {
+ return &mut *(self.buf_ptr.add(index));
+ }
+ }
+
+ panic!("Index out of bounds");
+ }
+ }
+
+ impl<RunAllocF, RunDeallocF> Drop for RunVec<RunAllocF, RunDeallocF>
+ where
+ RunAllocF: Fn(usize) -> *mut TimSortRun,
+ RunDeallocF: Fn(*mut TimSortRun, usize),
+ {
+ fn drop(&mut self) {
+ // As long as TimSortRun is Copy we don't need to drop them individually but just the
+ // whole allocation.
+ (self.run_dealloc_fn)(self.buf_ptr, self.capacity);
+ }
+ }
+}
+
+/// Internal type used by merge_sort.
+#[derive(Clone, Copy, Debug)]
+pub struct TimSortRun {
+ len: usize,
+ start: usize,
+}