subspace_proof_of_space/chiapos/table.rs

//! `chiapos` table implementation.

#[cfg(any(feature = "alloc", test))]
mod rmap;
#[cfg(test)]
mod tests;
pub(super) mod types;

use crate::chiapos::Seed;
use crate::chiapos::constants::{PARAM_B, PARAM_BC, PARAM_C, PARAM_EXT, PARAM_M};
#[cfg(feature = "alloc")]
use crate::chiapos::table::rmap::Rmap;
use crate::chiapos::table::types::{Metadata, X, Y};
#[cfg(feature = "alloc")]
use crate::chiapos::table::types::{Position, R};
use crate::chiapos::utils::EvaluatableUsize;
use ab_chacha8::{ChaCha8Block, ChaCha8State};
#[cfg(feature = "alloc")]
use alloc::boxed::Box;
#[cfg(feature = "alloc")]
use alloc::vec;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
#[cfg(feature = "alloc")]
use chacha20::cipher::{Iv, KeyIvInit, StreamCipher};
#[cfg(feature = "alloc")]
use chacha20::{ChaCha8, Key};
use core::array;
#[cfg(feature = "parallel")]
use core::cell::SyncUnsafeCell;
#[cfg(feature = "alloc")]
use core::mem;
#[cfg(feature = "alloc")]
use core::mem::MaybeUninit;
#[cfg(any(feature = "alloc", test))]
use core::simd::prelude::*;
#[cfg(feature = "parallel")]
use core::sync::atomic::{AtomicUsize, Ordering};
#[cfg(feature = "alloc")]
use derive_more::Deref;
#[cfg(any(feature = "alloc", test))]
use seq_macro::seq;
// TODO: Switch to `rclite` once https://github.com/fereidani/rclite/issues/11 is resolved
#[cfg(feature = "alloc")]
use alloc::sync::Arc;
use subspace_core_primitives::hashes::blake3_hash;

pub(super) const COMPUTE_F1_SIMD_FACTOR: usize = 8;
#[cfg(any(feature = "alloc", test))]
const COMPUTE_FN_SIMD_FACTOR: usize = 16;
#[allow(dead_code, reason = "unused when crate is compiled separately")]
const MAX_BUCKET_SIZE: usize = 512;
#[cfg(feature = "alloc")]
const BUCKET_SIZE_UPPER_BOUND_SECURITY_BITS: u8 = 128;
/// Reducing bucket size for better performance.
///
/// The number should be sufficient to produce enough proofs for sector encoding with high
/// probability.
// TODO: Statistical analysis if possible, confirming there will be enough proofs
#[allow(dead_code, reason = "unused when crate is compiled separately")]
const REDUCED_BUCKET_SIZE: usize = 272;
/// Reducing matches count for better performance.
///
/// The number should be sufficient to produce enough proofs for sector encoding with high
/// probability.
// TODO: Statistical analysis if possible, confirming there will be enough proofs
#[allow(dead_code, reason = "unused when crate is compiled separately")]
const REDUCED_MATCHES_COUNT: usize = 288;
#[cfg(feature = "parallel")]
const CACHE_LINE_SIZE: usize = 64;

const _: () = {
    debug_assert!(REDUCED_BUCKET_SIZE <= MAX_BUCKET_SIZE);
    debug_assert!(REDUCED_MATCHES_COUNT <= MAX_BUCKET_SIZE);
};

/// Compute the size of `y` in bits
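///
/// For instance, with the usual `PARAM_EXT = 6` (an assumption of this note) and `k = 20`,
/// `y` values are 26 bits wide.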
pub(super) const fn y_size_bits(k: u8) -> usize {
    k as usize + PARAM_EXT as usize
}

/// Metadata size in bytes
pub const fn metadata_size_bytes(k: u8, table_number: u8) -> usize {
    metadata_size_bits(k, table_number).div_ceil(u8::BITS as usize)
}

/// Metadata size in bits
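///
/// The per-table multipliers follow the Chia plotting format: tables 1 through 7 carry `k`,
/// `2k`, `4k`, `4k`, `3k`, `2k` and `0` bits of metadata respectively. A quick worked example
/// (illustrative only, this is a private API):
///
/// ```ignore
/// // For k = 20, table 2: metadata is 2 * 20 = 40 bits, which rounds up to 5 bytes
/// assert_eq!(metadata_size_bits(20, 2), 40);
/// assert_eq!(metadata_size_bytes(20, 2), 5);
/// ```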
pub(super) const fn metadata_size_bits(k: u8, table_number: u8) -> usize {
    k as usize
        * match table_number {
            1 => 1,
            2 => 2,
            3 | 4 => 4,
            5 => 3,
            6 => 2,
            7 => 0,
            _ => unreachable!(),
        }
}

/// Number of buckets for a given `k`
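///
/// A sketch of the arithmetic, assuming the usual Chia constants (`PARAM_BC = 15113` and
/// `PARAM_EXT = 6`; treat the concrete numbers as assumptions of this example):
///
/// ```ignore
/// // k = 20 → y values are 26 bits wide, so ceil(2^26 / 15113) = 4441 buckets
/// assert_eq!(num_buckets(20), 4441);
/// ```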
pub const fn num_buckets(k: u8) -> usize {
    2_usize
        .pow(y_size_bits(k) as u32)
        .div_ceil(PARAM_BC as usize)
}

108#[cfg(feature = "parallel")]
109#[inline(always)]
110fn strip_sync_unsafe_cell<const N: usize, T>(value: Box<[SyncUnsafeCell<T>; N]>) -> Box<[T; N]> {
111    // SAFETY: `SyncUnsafeCell` has the same layout as `T`
112    unsafe { Box::from_raw(Box::into_raw(value).cast()) }
113}
114
/// ChaCha8 keystream in a [`Vec`], sufficient for the whole first table for [`K`].
/// Prefer [`partial_y`] if you only need the partial `y` for a single `x`.
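///
/// The output holds `K * 2^K` bits of keystream; for `K = 20` that is
/// `20 * 1048576 / 8 = 2621440` bytes.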
117#[cfg(feature = "alloc")]
118fn partial_ys<const K: u8>(seed: Seed) -> Vec<u8> {
119    let output_len_bits = usize::from(K) * (1 << K);
120    let mut output = vec![0; output_len_bits.div_ceil(u8::BITS as usize)];
121
122    let key = Key::from(seed);
123    let iv = Iv::<ChaCha8>::default();
124
125    let mut cipher = ChaCha8::new(&key, &iv);
126
127    cipher.apply_keystream(&mut output);
128
129    output
130}
131
/// Calculate a probabilistic upper bound on the Chia bucket size for a given `k` and
/// `security_bits` (security level).
///
/// This is based on a Chernoff bound for the Poisson distribution with mean
/// `lambda = PARAM_BC / 2^PARAM_EXT`, ensuring the probability that any bucket exceeds the bound is
/// less than `2^{-security_bits}`.
/// The bound is `lambda + ceil(sqrt(3 * lambda * (k + security_bits) * ln(2)))`.
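///
/// A worked instance, assuming the usual Chia constants (`PARAM_BC = 15113`, `PARAM_EXT = 6`,
/// hence `lambda = 236`); the concrete numbers are illustrative: for `k = 20` and
/// `security_bits = 128`, the additive term is `ceil(sqrt(3 * 236 * 148 * ln(2))) = 270`, giving
/// a bound of `506`, which is what makes `MAX_BUCKET_SIZE = 512` sufficient.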
139#[cfg(feature = "alloc")]
140const fn bucket_size_upper_bound(k: u8, security_bits: u8) -> usize {
141    // Lambda is the expected number of entries in a bucket, approximated as
142    // `PARAM_BC / 2^PARAM_EXT`. It is independent of `k`.
143    const LAMBDA: u64 = PARAM_BC as u64 / 2u64.pow(PARAM_EXT as u32);
144    // Approximation of ln(2) as a fraction: ln(2) ≈ LN2_NUM / LN2_DEN.
145    // This allows integer-only computation of the square root term involving ln(2).
146    const LN2_NUM: u128 = 693147;
147    const LN2_DEN: u128 = 1000000;
148
149    // `k + security_bits` for the union bound over ~2^k intervals
150    let ks = k as u128 + security_bits as u128;
151    // Compute numerator for the expression under the square root:
152    // `3 * lambda * (k + security_bits) * LN2_NUM`
153    let num = 3u128 * LAMBDA as u128 * ks * LN2_NUM;
154    // Denominator for ln(2): `LN2_DEN`
155    let den = LN2_DEN;

    let ceil_div: u128 = num.div_ceil(den);

    // Binary search to find the smallest `x` such that `x * x >= ceil_div`,
    // which computes `ceil(sqrt(num / den))` without floating-point.
    // A custom binary search over the `u64` range is used because the standard library's binary
    // search operates on sorted slices, not directly on integer ranges for inequalities like this.
    let mut low = 0u64;
    let mut high = u64::MAX;
    while low < high {
        let mid = low + (high - low) / 2;
        let left = (mid as u128) * (mid as u128);
        if left >= ceil_div {
            high = mid;
        } else {
            low = mid + 1;
        }
    }
    let add_term = low;

    (LAMBDA + add_term) as usize
}

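/// Groups `(position, y)` pairs into `PARAM_BC`-sized buckets based on `y / PARAM_BC`.
///
/// Each bucket keeps at most `REDUCED_BUCKET_SIZE` entries (the rest are discarded) and is padded
/// with sentinel values at the end.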
179#[cfg(feature = "alloc")]
180fn group_by_buckets<const K: u8>(
181    ys: &[Y],
182) -> Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>
183where
184    [(); num_buckets(K)]:,
185{
186    let mut bucket_offsets = [0_u16; num_buckets(K)];
187    // SAFETY: Contents is `MaybeUninit`
188    let mut buckets = unsafe {
189        Box::<[[MaybeUninit<(Position, Y)>; REDUCED_BUCKET_SIZE]; num_buckets(K)]>::new_uninit()
190            .assume_init()
191    };
192
193    for (&y, position) in ys.iter().zip(Position::ZERO..) {
194        let bucket_index = (u32::from(y) / u32::from(PARAM_BC)) as usize;
195
196        // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
197        let bucket_offset = unsafe { bucket_offsets.get_unchecked_mut(bucket_index) };
198        // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
199        let bucket = unsafe { buckets.get_unchecked_mut(bucket_index) };
200
201        if *bucket_offset < REDUCED_BUCKET_SIZE as u16 {
202            bucket[*bucket_offset as usize].write((position, y));
203            *bucket_offset += 1;
204        }
205    }
206
207    for (bucket, initialized) in buckets.iter_mut().zip(bucket_offsets) {
208        bucket[usize::from(initialized)..].write_filled((Position::SENTINEL, Y::SENTINEL));
209    }
210
211    // SAFETY: All entries are initialized
212    unsafe { Box::from_raw(Box::into_raw(buckets).cast()) }
213}
214
/// Similar to [`group_by_buckets()`], but processes buckets instead of a flat list of `y`s.
///
/// # Safety
/// Iterator item is a list of potentially uninitialized `y`s and a number of initialized `y`s. The
/// number of initialized `y`s must be correct.
#[cfg(feature = "parallel")]
unsafe fn group_by_buckets_from_buckets<'a, const K: u8, I>(
    iter: I,
) -> Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>
where
    I: Iterator<Item = (&'a [MaybeUninit<Y>; REDUCED_MATCHES_COUNT], usize)> + 'a,
    [(); num_buckets(K)]:,
{
    let mut bucket_offsets = [0_u16; num_buckets(K)];
    // SAFETY: Contents is `MaybeUninit`
    let mut buckets = unsafe {
        Box::<[[MaybeUninit<(Position, Y)>; REDUCED_BUCKET_SIZE]; num_buckets(K)]>::new_uninit()
            .assume_init()
    };

    for ((ys, count), batch_start) in iter.zip((Position::ZERO..).step_by(REDUCED_MATCHES_COUNT)) {
        // SAFETY: Function contract guarantees that `y`s are initialized
        let ys = unsafe { ys[..count].assume_init_ref() };
        for (&y, position) in ys.iter().zip(batch_start..) {
            let bucket_index = (u32::from(y) / u32::from(PARAM_BC)) as usize;

            // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
            let bucket_offset = unsafe { bucket_offsets.get_unchecked_mut(bucket_index) };
            // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
            let bucket = unsafe { buckets.get_unchecked_mut(bucket_index) };

            if *bucket_offset < REDUCED_BUCKET_SIZE as u16 {
                bucket[*bucket_offset as usize].write((position, y));
                *bucket_offset += 1;
            }
        }
    }

    for (bucket, initialized) in buckets.iter_mut().zip(bucket_offsets) {
        bucket[usize::from(initialized)..].write_filled((Position::SENTINEL, Y::SENTINEL));
    }

    // SAFETY: All entries are initialized
    unsafe { Box::from_raw(Box::into_raw(buckets).cast()) }
}

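/// Wrapper that aligns its contents to a (presumed 64-byte) cache line to avoid false sharing
/// between adjacent entries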
261#[cfg(feature = "alloc")]
262#[derive(Debug, Copy, Clone, Deref)]
263#[repr(align(64))]
264struct CacheLineAligned<T>(T);
265
/// Mapping from `parity` and `r` to the sorted list of `PARAM_M` left targets
267#[cfg(feature = "alloc")]
268type LeftTargets = [[CacheLineAligned<[R; PARAM_M as usize]>; PARAM_BC as usize]; 2];
269
270#[cfg(feature = "alloc")]
271fn calculate_left_targets() -> Arc<LeftTargets> {
272    let mut left_targets = Arc::<LeftTargets>::new_uninit();
273    // SAFETY: Same layout and uninitialized in both cases
274    let left_targets_slice = unsafe {
275        mem::transmute::<
276            &mut MaybeUninit<[[CacheLineAligned<[R; PARAM_M as usize]>; PARAM_BC as usize]; 2]>,
277            &mut [[MaybeUninit<CacheLineAligned<[R; PARAM_M as usize]>>; PARAM_BC as usize]; 2],
278        >(Arc::get_mut_unchecked(&mut left_targets))
279    };
280
281    for parity in 0..=1 {
282        for r in 0..PARAM_BC {
283            let c = r / PARAM_C;
284
285            let mut arr = array::from_fn(|m| {
286                let m = m as u16;
287                R::from(
288                    ((c + m) % PARAM_B) * PARAM_C
289                        + (((2 * m + parity) * (2 * m + parity) + r) % PARAM_C),
290                )
291            });
292            arr.sort_unstable();
293            left_targets_slice[parity as usize][r as usize].write(CacheLineAligned(arr));
294        }
295    }
296
297    // SAFETY: Initialized all entries
298    unsafe { left_targets.assume_init() }
299}
300
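/// Computes a single left target `((r / C + m) % B) * C + ((2m + parity)^2 + r) % C` on demand,
/// without the precomputed [`LeftTargets`] lookup table; used by [`has_match`] for verification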
fn calculate_left_target_on_demand(parity: u32, r: u32, m: u32) -> u32 {
    let param_b = u32::from(PARAM_B);
    let param_c = u32::from(PARAM_C);

    ((r / param_c + m) % param_b) * param_c + (((2 * m + parity) * (2 * m + parity) + r) % param_c)
}

/// Caches that can be used to optimize the creation of multiple [`Tables`](super::Tables).
#[cfg(feature = "alloc")]
#[derive(Debug, Clone)]
pub struct TablesCache {
    left_targets: Arc<LeftTargets>,
}

#[cfg(feature = "alloc")]
impl Default for TablesCache {
    /// Create a new instance
    fn default() -> Self {
        Self {
            left_targets: calculate_left_targets(),
        }
    }
}

#[cfg(feature = "alloc")]
#[derive(Debug, Copy, Clone)]
struct Match {
    left_position: Position,
    left_y: Y,
    right_position: Position,
}

/// Compute `y` of the first table for a single `x`, deriving the necessary keystream bits
/// directly from `seed` (the internal `partial_y_offset` is in bits within the keystream)
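///
/// The resulting `y` is laid out as `[zero padding][K bits of keystream][PARAM_EXT bits of x]`,
/// so with the usual `PARAM_EXT = 6` (an assumption of this note) and `K = 20` it occupies the
/// low 26 bits of the returned value.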
pub(super) fn compute_f1<const K: u8>(x: X, seed: &Seed) -> Y {
    let skip_bits = u32::from(K) * u32::from(x);
    let skip_u32s = skip_bits / u32::BITS;
    let partial_y_offset = skip_bits % u32::BITS;

    const U32S_PER_BLOCK: usize = size_of::<ChaCha8Block>() / size_of::<u32>();

    let initial_state = ChaCha8State::init(seed, &[0; _]);
    let first_block_counter = skip_u32s / U32S_PER_BLOCK as u32;
    let u32_in_first_block = skip_u32s as usize % U32S_PER_BLOCK;

    let first_block = initial_state.compute_block(first_block_counter);
    let hi = first_block[u32_in_first_block].to_be();

    // TODO: Is SIMD version of `compute_block()` that produces two blocks at once possible?
    let lo = if u32_in_first_block + 1 == U32S_PER_BLOCK {
        // Spilled over into the second block
        let second_block = initial_state.compute_block(first_block_counter + 1);
        second_block[0].to_be()
    } else {
        first_block[u32_in_first_block + 1].to_be()
    };

    let partial_y = (u64::from(hi) << u32::BITS) | u64::from(lo);

    let pre_y = partial_y >> (u64::BITS - u32::from(K + PARAM_EXT) - partial_y_offset);
    let pre_y = pre_y as u32;
    // Mask for clearing the rest of bits of `pre_y`.
    let pre_y_mask = (u32::MAX << PARAM_EXT) & (u32::MAX >> (u32::BITS - u32::from(K + PARAM_EXT)));

    // Extract `PARAM_EXT` most significant bits from `x` and store in the final offset of
    // eventual `y` with the rest of bits being zero (`x` is `0..2^K`)
    let pre_ext = u32::from(x) >> (K - PARAM_EXT);

    // Combine all of the bits together:
    // [padding zero bits][`K` bits from `partial_y`][`PARAM_EXT` bits from `x`]
    Y::from((pre_y & pre_y_mask) | pre_ext)
}

373#[cfg(any(feature = "alloc", test))]
374pub(super) fn compute_f1_simd<const K: u8>(
375    xs: Simd<u32, COMPUTE_F1_SIMD_FACTOR>,
376    partial_ys: &[u8; K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize],
377) -> [Y; COMPUTE_F1_SIMD_FACTOR] {
378    // Each element contains `K` desired bits of `partial_ys` in the final offset of eventual `ys`
379    // with the rest of bits being in undefined state
380    let pre_ys_bytes = array::from_fn(|i| {
381        let partial_y_offset = i * usize::from(K);
382        let partial_y_length =
383            (partial_y_offset % u8::BITS as usize + usize::from(K)).div_ceil(u8::BITS as usize);
384        let mut pre_y_bytes = 0u64.to_be_bytes();
385        pre_y_bytes[..partial_y_length].copy_from_slice(
386            &partial_ys[partial_y_offset / u8::BITS as usize..][..partial_y_length],
387        );
388
389        u64::from_be_bytes(pre_y_bytes)
390    });
391    let pre_ys_right_offset = array::from_fn(|i| {
392        let partial_y_offset = i as u32 * u32::from(K);
393        u64::from(u64::BITS - u32::from(K + PARAM_EXT) - partial_y_offset % u8::BITS)
394    });
395    let pre_ys = Simd::from_array(pre_ys_bytes) >> Simd::from_array(pre_ys_right_offset);
396
397    // Mask for clearing the rest of bits of `pre_ys`.
398    let pre_ys_mask = Simd::splat(
399        (u32::MAX << usize::from(PARAM_EXT))
400            & (u32::MAX >> (u32::BITS as usize - usize::from(K + PARAM_EXT))),
401    );
402
403    // Extract `PARAM_EXT` most significant bits from `xs` and store in the final offset of
404    // eventual `ys` with the rest of bits being in undefined state.
405    let pre_exts = xs >> Simd::splat(u32::from(K - PARAM_EXT));
406
    // Combine all of the bits together:
    // [padding zero bits][`K` bits from `partial_y`][`PARAM_EXT` bits from `x`]
    let ys = (pre_ys.cast() & pre_ys_mask) | pre_exts;

    Y::array_from_repr(ys.to_array())
}

/// For verification use [`has_match`] instead.
///
/// # Safety
/// Left and right bucket positions must correspond to the parent table.
// TODO: Try to reduce the `matches` size further by processing `left_bucket` in chunks (like halves
//  for example)
#[cfg(feature = "alloc")]
unsafe fn find_matches_in_buckets<'a>(
    left_bucket_index: u32,
    left_bucket: &[(Position, Y); REDUCED_BUCKET_SIZE],
    right_bucket: &[(Position, Y); REDUCED_BUCKET_SIZE],
    // `PARAM_M as usize * 2` corresponds to an upper bound on the number of matches a single `y`
    // in the left bucket might have here
    matches: &'a mut [MaybeUninit<Match>; REDUCED_MATCHES_COUNT + PARAM_M as usize * 2],
    left_targets: &LeftTargets,
) -> &'a [Match] {
    let left_base = left_bucket_index * u32::from(PARAM_BC);
    let right_base = left_base + u32::from(PARAM_BC);

    let mut rmap = Rmap::new();
    for &(right_position, y) in right_bucket {
        if right_position == Position::SENTINEL {
            break;
        }
        let r = R::from((u32::from(y) - right_base) as u16);
        // SAFETY: `r` is within `0..PARAM_BC` range by definition, the right bucket is limited to
        // `REDUCED_BUCKET_SIZE`
        unsafe {
            rmap.add(r, right_position);
        }
    }

    let parity = left_base % 2;
    let left_targets_parity = &left_targets[parity as usize];
    let mut next_match_index = 0;

    // TODO: Simd read for left bucket? It might be more efficient in terms of memory access to
    //  process chunks of the left bucket against one right value for each at a time
    for &(left_position, y) in left_bucket {
453        // `next_match_index >= REDUCED_MATCHES_COUNT` is crucial to make sure
454        if left_position == Position::SENTINEL || next_match_index >= REDUCED_MATCHES_COUNT {
455            // Sentinel values are padded to the end of the bucket
456            break;
457        }

        let r = R::from((u32::from(y) - left_base) as u16);
        // SAFETY: `r` is within a bucket and exists by definition
        let left_targets_r = unsafe { left_targets_parity.get_unchecked(usize::from(r)) };

        for &r_target in left_targets_r.iter() {
            // SAFETY: Targets are always limited to `PARAM_BC`
            let [right_position_a, right_position_b] = unsafe { rmap.get(r_target) };

            // The right bucket position is never zero
            if right_position_a != Position::ZERO {
                // SAFETY: Iteration will stop before `REDUCED_MATCHES_COUNT + PARAM_M * 2`
                // elements are inserted
                unsafe { matches.get_unchecked_mut(next_match_index) }.write(Match {
                    left_position,
                    left_y: y,
                    right_position: right_position_a,
                });
                next_match_index += 1;

                if right_position_b != Position::ZERO {
                    // SAFETY: Iteration will stop before
                    // `REDUCED_MATCHES_COUNT + PARAM_M * 2` elements are inserted
                    unsafe { matches.get_unchecked_mut(next_match_index) }.write(Match {
                        left_position,
                        left_y: y,
                        right_position: right_position_b,
                    });
                    next_match_index += 1;
                }
            }
        }
    }

    // SAFETY: Initialized this many matches
    unsafe { matches[..next_match_index].assume_init_ref() }
}

/// Simplified version of [`find_matches_in_buckets`] for verification purposes.
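///
/// A minimal usage sketch; the values are placeholders rather than real table entries, and the
/// two `y`s are assumed to come from adjacent buckets:
///
/// ```ignore
/// let left_y = Y::from(0u32);
/// let right_y = Y::from(u32::from(PARAM_BC)); // first value of the next bucket
/// let _matches = has_match(left_y, right_y);
/// ```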
pub(super) fn has_match(left_y: Y, right_y: Y) -> bool {
    let right_r = u32::from(right_y) % u32::from(PARAM_BC);
    let parity = (u32::from(left_y) / u32::from(PARAM_BC)) % 2;
    let left_r = u32::from(left_y) % u32::from(PARAM_BC);

    let r_targets = array::from_fn::<_, { PARAM_M as usize }, _>(|i| {
        calculate_left_target_on_demand(parity, left_r, i as u32)
    });

    r_targets.contains(&right_r)
}

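/// Computes the next table's `(y, metadata)` for a matched pair: `y` and both parent metadata
/// values are concatenated bit-wise and hashed with BLAKE3 to produce the output `y`; the output
/// metadata is either the concatenation of the parent metadata values (tables 2 and 3) or taken
/// from the hash (later tables, except the last one, which carries no metadata)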
#[inline(always)]
pub(super) fn compute_fn<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    y: Y,
    left_metadata: Metadata<K, PARENT_TABLE_NUMBER>,
    right_metadata: Metadata<K, PARENT_TABLE_NUMBER>,
) -> (Y, Metadata<K, TABLE_NUMBER>)
where
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
{
    let left_metadata = u128::from(left_metadata);
    let right_metadata = u128::from(right_metadata);

    let parent_metadata_bits = metadata_size_bits(K, PARENT_TABLE_NUMBER);

    // Part of the `right_bits` at the final offset of eventual `input_a`
    let y_and_left_bits = y_size_bits(K) + parent_metadata_bits;
    let right_bits_start_offset = u128::BITS as usize - parent_metadata_bits;

    // Take only bytes where bits were set
    let num_bytes_with_data =
        (y_size_bits(K) + parent_metadata_bits * 2).div_ceil(u8::BITS as usize);

    // Only supports `K` from 15 to 25 (otherwise math will not be correct when concatenating y,
    // left metadata and right metadata)
    let hash = {
        // Collect `K` most significant bits of `y` at the final offset of eventual `input_a`
        let y_bits = u128::from(y) << (u128::BITS as usize - y_size_bits(K));

        // Move bits of `left_metadata` to the final offset of eventual `input_a`
        let left_metadata_bits =
            left_metadata << (u128::BITS as usize - parent_metadata_bits - y_size_bits(K));

        // If `right_metadata` bits start to the left of the desired position in `input_a` move
        // bits right, else move left
        if right_bits_start_offset < y_and_left_bits {
            let right_bits_pushed_into_input_b = y_and_left_bits - right_bits_start_offset;
            // Collect bits of `right_metadata` that will fit into `input_a` at the final offset in
            // eventual `input_a`
            let right_bits_a = right_metadata >> right_bits_pushed_into_input_b;
            let input_a = y_bits | left_metadata_bits | right_bits_a;
            // Collect bits of `right_metadata` that will spill over into `input_b`
            let input_b = right_metadata << (u128::BITS as usize - right_bits_pushed_into_input_b);

            let input = [input_a.to_be_bytes(), input_b.to_be_bytes()];
            let input_len =
                size_of::<u128>() + right_bits_pushed_into_input_b.div_ceil(u8::BITS as usize);
            blake3_hash(&input.as_flattened()[..input_len])
        } else {
            let right_bits_a = right_metadata << (right_bits_start_offset - y_and_left_bits);
            let input_a = y_bits | left_metadata_bits | right_bits_a;

            blake3_hash(&input_a.to_be_bytes()[..num_bytes_with_data])
        }
    };

    let y_output = Y::from(
        u32::from_be_bytes([hash[0], hash[1], hash[2], hash[3]])
            >> (u32::BITS as usize - y_size_bits(K)),
    );

    let metadata_size_bits = metadata_size_bits(K, TABLE_NUMBER);

    let metadata = if TABLE_NUMBER < 4 {
        (left_metadata << parent_metadata_bits) | right_metadata
    } else if metadata_size_bits > 0 {
        // For K up to 25 it is guaranteed that metadata + bit offset will always fit into u128.
        // We collect the bytes necessary, potentially with extra bits at the start and end of the
        // bytes that will be taken care of later.
        let metadata = u128::from_be_bytes(
            hash[y_size_bits(K) / u8::BITS as usize..][..size_of::<u128>()]
                .try_into()
                .expect("Always enough bits for any K; qed"),
        );
        // Remove extra bits at the beginning
        let metadata = metadata << (y_size_bits(K) % u8::BITS as usize);
        // Move bits into the correct location
        metadata >> (u128::BITS as usize - metadata_size_bits)
    } else {
        0
    };

    (y_output, Metadata::from(metadata))
}

// TODO: This is actually using only pipelining rather than real SIMD (at least explicitly) due to:
//  * https://github.com/rust-lang/portable-simd/issues/108
//  * https://github.com/BLAKE3-team/BLAKE3/issues/478#issuecomment-3200106103
#[cfg(any(feature = "alloc", test))]
fn compute_fn_simd<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    left_ys: [Y; COMPUTE_FN_SIMD_FACTOR],
    left_metadatas: [Metadata<K, PARENT_TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
    right_metadatas: [Metadata<K, PARENT_TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
) -> (
    [Y; COMPUTE_FN_SIMD_FACTOR],
    [Metadata<K, TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
)
where
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
{
    let parent_metadata_bits = metadata_size_bits(K, PARENT_TABLE_NUMBER);
    let metadata_size_bits = metadata_size_bits(K, TABLE_NUMBER);

    // TODO: `u128` is not supported as SIMD element yet, see
    //  https://github.com/rust-lang/portable-simd/issues/108
    let left_metadatas: [u128; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
        [
        #(
            u128::from(left_metadatas[N]),
        )*
        ]
    });
    let right_metadatas: [u128; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
        [
        #(
            u128::from(right_metadatas[N]),
        )*
        ]
    });

    // Part of the `right_bits` at the final offset of eventual `input_a`
    let y_and_left_bits = y_size_bits(K) + parent_metadata_bits;
    let right_bits_start_offset = u128::BITS as usize - parent_metadata_bits;

    // Take only bytes where bits were set
    let num_bytes_with_data =
        (y_size_bits(K) + parent_metadata_bits * 2).div_ceil(u8::BITS as usize);

    // Only supports `K` from 15 to 25 (otherwise math will not be correct when concatenating y,
    // left metadata and right metadata)
    // TODO: SIMD hashing once this is possible:
    //  https://github.com/BLAKE3-team/BLAKE3/issues/478#issuecomment-3200106103
    let hashes: [_; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
        [
        #(
        {
            let y = left_ys[N];
            let left_metadata = left_metadatas[N];
            let right_metadata = right_metadatas[N];

            // Collect `K` most significant bits of `y` at the final offset of eventual
            // `input_a`
            let y_bits = u128::from(y) << (u128::BITS as usize - y_size_bits(K));

            // Move bits of `left_metadata` to the final offset of eventual `input_a`
            let left_metadata_bits =
                left_metadata << (u128::BITS as usize - parent_metadata_bits - y_size_bits(K));

            // If `right_metadata` bits start to the left of the desired position in `input_a` move
            // bits right, else move left
            if right_bits_start_offset < y_and_left_bits {
                let right_bits_pushed_into_input_b = y_and_left_bits - right_bits_start_offset;
                // Collect bits of `right_metadata` that will fit into `input_a` at the final offset
                // in eventual `input_a`
                let right_bits_a = right_metadata >> right_bits_pushed_into_input_b;
                let input_a = y_bits | left_metadata_bits | right_bits_a;
                // Collect bits of `right_metadata` that will spill over into `input_b`
                let input_b = right_metadata << (u128::BITS as usize - right_bits_pushed_into_input_b);

                let input = [input_a.to_be_bytes(), input_b.to_be_bytes()];
                let input_len =
                    size_of::<u128>() + right_bits_pushed_into_input_b.div_ceil(u8::BITS as usize);
                blake3_hash(&input.as_flattened()[..input_len])
            } else {
                let right_bits_a = right_metadata << (right_bits_start_offset - y_and_left_bits);
                let input_a = y_bits | left_metadata_bits | right_bits_a;

                blake3_hash(&input_a.to_be_bytes()[..num_bytes_with_data])
            }
        },
        )*
        ]
    });

    let y_outputs = Simd::from_array(
        hashes.map(|hash| u32::from_be_bytes([hash[0], hash[1], hash[2], hash[3]])),
    ) >> (u32::BITS - y_size_bits(K) as u32);
    let y_outputs = Y::array_from_repr(y_outputs.to_array());

    let metadatas = if TABLE_NUMBER < 4 {
        seq!(N in 0..16 {
            [
            #(
                Metadata::from((left_metadatas[N] << parent_metadata_bits) | right_metadatas[N]),
            )*
            ]
        })
    } else if metadata_size_bits > 0 {
        // For K up to 25 it is guaranteed that metadata + bit offset will always fit into u128.
        // We collect the bytes necessary, potentially with extra bits at the start and end of the
        // bytes that will be taken care of later.
        seq!(N in 0..16 {
            [
            #(
            {
                let metadata = u128::from_be_bytes(
                    hashes[N][y_size_bits(K) / u8::BITS as usize..][..size_of::<u128>()]
                        .try_into()
                        .expect("Always enough bits for any K; qed"),
                );
                // Remove extra bits at the beginning
                let metadata = metadata << (y_size_bits(K) % u8::BITS as usize);
                // Move bits into the correct location
                Metadata::from(metadata >> (u128::BITS as usize - metadata_size_bits))
            },
            )*
            ]
        })
    } else {
        [Metadata::default(); _]
    };

    (y_outputs, metadatas)
}

/// # Safety
/// `m` must contain positions that correspond to the parent table
#[cfg(feature = "alloc")]
unsafe fn match_to_result<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
    m: &Match,
) -> (Y, [Position; 2], Metadata<K, TABLE_NUMBER>)
where
    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    // SAFETY: Guaranteed by function contract
    let left_metadata = unsafe { parent_table.metadata(m.left_position) };
    // SAFETY: Guaranteed by function contract
    let right_metadata = unsafe { parent_table.metadata(m.right_position) };

    let (y, metadata) =
        compute_fn::<K, TABLE_NUMBER, PARENT_TABLE_NUMBER>(m.left_y, left_metadata, right_metadata);

    (y, [m.left_position, m.right_position], metadata)
}

/// # Safety
/// `matches` must contain positions that correspond to the parent table
#[cfg(feature = "alloc")]
#[inline(always)]
unsafe fn match_to_result_simd<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
    matches: &[Match; COMPUTE_FN_SIMD_FACTOR],
) -> (
    [Y; COMPUTE_FN_SIMD_FACTOR],
    [[Position; 2]; COMPUTE_FN_SIMD_FACTOR],
    [Metadata<K, TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
)
where
    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    let left_ys: [_; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
        [
        #(
            matches[N].left_y,
        )*
        ]
    });
    // SAFETY: Guaranteed by function contract
    let left_metadatas: [_; COMPUTE_FN_SIMD_FACTOR] = unsafe {
        seq!(N in 0..16 {
            [
            #(
                parent_table.metadata(matches[N].left_position),
            )*
            ]
        })
    };
    // SAFETY: Guaranteed by function contract
    let right_metadatas: [_; COMPUTE_FN_SIMD_FACTOR] = unsafe {
        seq!(N in 0..16 {
            [
            #(
                parent_table.metadata(matches[N].right_position),
            )*
            ]
        })
    };

    let (y_outputs, metadatas) = compute_fn_simd::<K, TABLE_NUMBER, PARENT_TABLE_NUMBER>(
        left_ys,
        left_metadatas,
        right_metadatas,
    );

    let positions = seq!(N in 0..16 {
        [
        #(
            [
                matches[N].left_position,
                matches[N].right_position,
            ],
        )*
        ]
    });

    (y_outputs, positions, metadatas)
}

/// # Safety
/// `matches` must contain positions that correspond to the parent table. The lengths of `ys`,
/// `positions` and `metadatas` must be at least the length of `matches`
822#[cfg(feature = "alloc")]
823#[inline(always)]
824unsafe fn matches_to_results<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
825    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
826    matches: &[Match],
827    ys: &mut [MaybeUninit<Y>],
828    positions: &mut [MaybeUninit<[Position; 2]>],
829    metadatas: &mut [MaybeUninit<Metadata<K, TABLE_NUMBER>>],
830) where
831    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
832    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
833    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
834    [(); 1 << K]:,
835    [(); num_buckets(K)]:,
836    [(); num_buckets(K) - 1]:,
837{
838    let (grouped_matches, other_matches) = matches.as_chunks::<COMPUTE_FN_SIMD_FACTOR>();
839    let (grouped_ys, other_ys) = ys.split_at_mut(grouped_matches.as_flattened().len());
840    let grouped_ys = grouped_ys.as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>().0;
841    let (grouped_positions, other_positions) =
842        positions.split_at_mut(grouped_matches.as_flattened().len());
843    let grouped_positions = grouped_positions
844        .as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>()
845        .0;
846    let (grouped_metadatas, other_metadatas) =
847        metadatas.split_at_mut(grouped_matches.as_flattened().len());
848    let grouped_metadatas = grouped_metadatas
849        .as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>()
850        .0;
851
852    for (((grouped_matches, grouped_ys), grouped_positions), grouped_metadatas) in grouped_matches
853        .iter()
854        .zip(grouped_ys)
855        .zip(grouped_positions)
856        .zip(grouped_metadatas)
857    {
858        // SAFETY: Guaranteed by function contract
859        let (ys_group, positions_group, metadatas_group) =
860            unsafe { match_to_result_simd(parent_table, grouped_matches) };
861        grouped_ys.write_copy_of_slice(&ys_group);
862        grouped_positions.write_copy_of_slice(&positions_group);
863
864        // The last table doesn't have metadata
865        if metadata_size_bits(K, TABLE_NUMBER) > 0 {
866            grouped_metadatas.write_copy_of_slice(&metadatas_group);
867        }
868    }
869    for (((other_match, other_y), other_positions), other_metadata) in other_matches
870        .iter()
871        .zip(other_ys)
872        .zip(other_positions)
873        .zip(other_metadatas)
874    {
875        // SAFETY: Guaranteed by function contract
876        let (y, p, metadata) = unsafe { match_to_result(parent_table, other_match) };
877        other_y.write(y);
878        other_positions.write(p);
879        // The last table doesn't have metadata
880        if metadata_size_bits(K, TABLE_NUMBER) > 0 {
881            other_metadata.write(metadata);
882        }
883    }
884}
885
/// Similar to [`Table`], but with a smaller size for the later processing stages
#[cfg(feature = "alloc")]
#[derive(Debug)]
pub(super) enum PrunedTable<const K: u8, const TABLE_NUMBER: u8>
where
    [(); 1 << K]:,
    [(); num_buckets(K) - 1]:,
{
    /// First table
    First,
    /// Other tables
    Other {
        /// Left and right entry positions in a previous table encoded into bits
        positions: Box<[MaybeUninit<[Position; 2]>; 1 << K]>,
    },
    /// Other tables, bucket-based variant produced by parallel table creation
    #[cfg(feature = "parallel")]
    OtherBuckets {
        /// Left and right entry positions in a previous table encoded into bits.
        ///
        /// Only positions at indices present in the originating table's `buckets` field are
        /// guaranteed to be initialized.
        positions: Box<[[MaybeUninit<[Position; 2]>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1]>,
    },
}

#[cfg(feature = "alloc")]
impl<const K: u8, const TABLE_NUMBER: u8> PrunedTable<K, TABLE_NUMBER>
where
    [(); 1 << K]:,
    [(); num_buckets(K) - 1]:,
{
    /// Get `[left_position, right_position]` of a previous table for a specified position in a
    /// current table.
    ///
    /// # Safety
    /// `position` must come from [`Table::buckets()`] or [`Self::position()`] and not be a sentinel
    /// value.
    #[inline(always)]
    pub(super) unsafe fn position(&self, position: Position) -> [Position; 2] {
        match self {
            Self::First => {
                unreachable!("Not the first table");
            }
            Self::Other { positions, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe { positions.get_unchecked(usize::from(position)).assume_init() }
            }
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { positions, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe {
                    positions
                        .as_flattened()
                        .get_unchecked(usize::from(position))
                        .assume_init()
                }
            }
        }
    }
}

946#[cfg(feature = "alloc")]
947#[derive(Debug)]
948pub(super) enum Table<const K: u8, const TABLE_NUMBER: u8>
949where
950    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
951    [(); 1 << K]:,
952    [(); num_buckets(K)]:,
953    [(); num_buckets(K) - 1]:,
954{
955    /// First table
956    First {
957        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
958        ///
959        /// Buckets are padded with sentinel values to `REDUCED_BUCKETS_SIZE`.
960        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
961    },
962    /// Other tables
963    Other {
964        /// Left and right entry positions in a previous table encoded into bits
965        positions: Box<[MaybeUninit<[Position; 2]>; 1 << K]>,
966        /// Metadata corresponding to each entry
967        metadatas: Box<[MaybeUninit<Metadata<K, TABLE_NUMBER>>; 1 << K]>,
968        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
969        ///
970        /// Buckets are padded with sentinel values to `REDUCED_BUCKETS_SIZE`.
971        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
972    },
973    /// Other tables
974    #[cfg(feature = "parallel")]
975    OtherBuckets {
976        /// Left and right entry positions in a previous table encoded into bits.
977        ///
978        /// Only positions from the `buckets` field are guaranteed to be initialized.
979        positions: Box<[[MaybeUninit<[Position; 2]>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1]>,
980        /// Metadata corresponding to each entry.
981        ///
982        /// Only positions from the `buckets` field are guaranteed to be initialized.
983        metadatas: Box<
984            [[MaybeUninit<Metadata<K, TABLE_NUMBER>>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1],
985        >,
986        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
987        ///
988        /// Buckets are padded with sentinel values to `REDUCED_BUCKETS_SIZE`.
989        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
990    },
991}
992
993#[cfg(feature = "alloc")]
994impl<const K: u8> Table<K, 1>
995where
996    EvaluatableUsize<{ metadata_size_bytes(K, 1) }>: Sized,
997    [(); 1 << K]:,
998    [(); num_buckets(K)]:,
999    [(); num_buckets(K) - 1]:,
1000{
1001    /// Create the table
1002    pub(super) fn create(seed: Seed) -> Self
1003    where
1004        EvaluatableUsize<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>: Sized,
1005    {
1006        // `MAX_BUCKET_SIZE` is not actively used, but is an upper-bound reference for the other
1007        // parameters
1008        debug_assert!(
1009            MAX_BUCKET_SIZE >= bucket_size_upper_bound(K, BUCKET_SIZE_UPPER_BOUND_SECURITY_BITS),
1010            "Max bucket size is not sufficiently large"
1011        );
1012
1013        let partial_ys = partial_ys::<K>(seed);
1014
1015        // SAFETY: Contents is `MaybeUninit`
1016        let mut ys = unsafe { Box::<[MaybeUninit<Y>; 1 << K]>::new_uninit().assume_init() };
1017
1018        for ((ys, xs_batch_start), partial_ys) in ys
1019            .as_chunks_mut::<COMPUTE_F1_SIMD_FACTOR>()
1020            .0
1021            .iter_mut()
1022            .zip((X::ZERO..).step_by(COMPUTE_F1_SIMD_FACTOR))
1023            .zip(
1024                partial_ys
1025                    .as_chunks::<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>()
1026                    .0,
1027            )
1028        {
1029            let xs = Simd::splat(u32::from(xs_batch_start))
1030                + Simd::from_array(array::from_fn(|i| i as u32));
1031            let ys_batch = compute_f1_simd::<K>(xs, partial_ys);
1032
1033            ys.write_copy_of_slice(&ys_batch);
1034        }
1035
1036        // SAFETY: Converting a boxed array to a vector of the same size, which has the same memory
1037        // layout, all elements were initialized
1038        let ys = unsafe {
1039            let ys_len = ys.len();
1040            let ys = Box::into_raw(ys);
1041            Vec::from_raw_parts(ys.cast(), ys_len, ys_len)
1042        };
1043
1044        // TODO: Try to group buckets in the process of collecting `y`s
1045        let buckets = group_by_buckets::<K>(&ys);
1046
1047        Self::First { buckets }
1048    }
1049
    /// Create the table, leveraging available parallelism
1051    #[cfg(feature = "parallel")]
1052    pub(super) fn create_parallel(seed: Seed) -> Self
1053    where
1054        EvaluatableUsize<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>: Sized,
1055    {
1056        // `MAX_BUCKET_SIZE` is not actively used, but is an upper-bound reference for the other
1057        // parameters
1058        debug_assert!(
1059            MAX_BUCKET_SIZE >= bucket_size_upper_bound(K, BUCKET_SIZE_UPPER_BOUND_SECURITY_BITS),
1060            "Max bucket size is not sufficiently large"
1061        );
1062
1063        let partial_ys = partial_ys::<K>(seed);
1064
1065        // SAFETY: Contents is `MaybeUninit`
1066        let mut ys = unsafe { Box::<[MaybeUninit<Y>; 1 << K]>::new_uninit().assume_init() };
1067
1068        // TODO: Try parallelism here?
1069        for ((ys, xs_batch_start), partial_ys) in ys
1070            .as_chunks_mut::<COMPUTE_F1_SIMD_FACTOR>()
1071            .0
1072            .iter_mut()
1073            .zip((X::ZERO..).step_by(COMPUTE_F1_SIMD_FACTOR))
1074            .zip(
1075                partial_ys
1076                    .as_chunks::<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>()
1077                    .0,
1078            )
1079        {
1080            let xs = Simd::splat(u32::from(xs_batch_start))
1081                + Simd::from_array(array::from_fn(|i| i as u32));
1082            let ys_batch = compute_f1_simd::<K>(xs, partial_ys);
1083
1084            ys.write_copy_of_slice(&ys_batch);
1085        }
1086
1087        // SAFETY: Converting a boxed array to a vector of the same size, which has the same memory
1088        // layout, all elements were initialized
1089        let ys = unsafe {
1090            let ys_len = ys.len();
1091            let ys = Box::into_raw(ys);
1092            Vec::from_raw_parts(ys.cast(), ys_len, ys_len)
1093        };
1094
1095        // TODO: Try to group buckets in the process of collecting `y`s
1096        let buckets = group_by_buckets::<K>(&ys);
1097
1098        Self::First { buckets }
1099    }
1100}
1101
1102#[cfg(feature = "alloc")]
1103mod private {
1104    pub(in super::super) trait SupportedOtherTables {}
1105    pub(in super::super) trait NotLastTable {}
1106}
1107
1108#[cfg(feature = "alloc")]
1109impl<const K: u8> private::SupportedOtherTables for Table<K, 2>
1110where
1111    EvaluatableUsize<{ metadata_size_bytes(K, 2) }>: Sized,
1112    [(); 1 << K]:,
1113    [(); num_buckets(K)]:,
1114    [(); num_buckets(K) - 1]:,
1115{
1116}
1117#[cfg(feature = "alloc")]
1118impl<const K: u8> private::SupportedOtherTables for Table<K, 3>
1119where
1120    EvaluatableUsize<{ metadata_size_bytes(K, 3) }>: Sized,
1121    [(); 1 << K]:,
1122    [(); num_buckets(K)]:,
1123    [(); num_buckets(K) - 1]:,
1124{
1125}
1126#[cfg(feature = "alloc")]
1127impl<const K: u8> private::SupportedOtherTables for Table<K, 4>
1128where
1129    EvaluatableUsize<{ metadata_size_bytes(K, 4) }>: Sized,
1130    [(); 1 << K]:,
1131    [(); num_buckets(K)]:,
1132    [(); num_buckets(K) - 1]:,
1133{
1134}
1135#[cfg(feature = "alloc")]
1136impl<const K: u8> private::SupportedOtherTables for Table<K, 5>
1137where
1138    EvaluatableUsize<{ metadata_size_bytes(K, 5) }>: Sized,
1139    [(); 1 << K]:,
1140    [(); num_buckets(K)]:,
1141    [(); num_buckets(K) - 1]:,
1142{
1143}
1144#[cfg(feature = "alloc")]
1145impl<const K: u8> private::SupportedOtherTables for Table<K, 6>
1146where
1147    EvaluatableUsize<{ metadata_size_bytes(K, 6) }>: Sized,
1148    [(); 1 << K]:,
1149    [(); num_buckets(K)]:,
1150    [(); num_buckets(K) - 1]:,
1151{
1152}
1153#[cfg(feature = "alloc")]
1154impl<const K: u8> private::SupportedOtherTables for Table<K, 7>
1155where
1156    EvaluatableUsize<{ metadata_size_bytes(K, 7) }>: Sized,
1157    [(); 1 << K]:,
1158    [(); num_buckets(K)]:,
1159    [(); num_buckets(K) - 1]:,
1160{
1161}
1162
1163#[cfg(feature = "alloc")]
1164impl<const K: u8> private::NotLastTable for Table<K, 1>
1165where
1166    EvaluatableUsize<{ metadata_size_bytes(K, 1) }>: Sized,
1167    [(); 1 << K]:,
1168    [(); num_buckets(K)]:,
1169    [(); num_buckets(K) - 1]:,
1170{
1171}
1172#[cfg(feature = "alloc")]
1173impl<const K: u8> private::NotLastTable for Table<K, 2>
1174where
1175    EvaluatableUsize<{ metadata_size_bytes(K, 2) }>: Sized,
1176    [(); 1 << K]:,
1177    [(); num_buckets(K)]:,
1178    [(); num_buckets(K) - 1]:,
1179{
1180}
1181#[cfg(feature = "alloc")]
1182impl<const K: u8> private::NotLastTable for Table<K, 3>
1183where
1184    EvaluatableUsize<{ metadata_size_bytes(K, 3) }>: Sized,
1185    [(); 1 << K]:,
1186    [(); num_buckets(K)]:,
1187    [(); num_buckets(K) - 1]:,
1188{
1189}
1190#[cfg(feature = "alloc")]
1191impl<const K: u8> private::NotLastTable for Table<K, 4>
1192where
1193    EvaluatableUsize<{ metadata_size_bytes(K, 4) }>: Sized,
1194    [(); 1 << K]:,
1195    [(); num_buckets(K)]:,
1196    [(); num_buckets(K) - 1]:,
1197{
1198}
1199#[cfg(feature = "alloc")]
1200impl<const K: u8> private::NotLastTable for Table<K, 5>
1201where
1202    EvaluatableUsize<{ metadata_size_bytes(K, 5) }>: Sized,
1203    [(); 1 << K]:,
1204    [(); num_buckets(K)]:,
1205    [(); num_buckets(K) - 1]:,
1206{
1207}
1208#[cfg(feature = "alloc")]
1209impl<const K: u8> private::NotLastTable for Table<K, 6>
1210where
1211    EvaluatableUsize<{ metadata_size_bytes(K, 6) }>: Sized,
1212    [(); 1 << K]:,
1213    [(); num_buckets(K)]:,
1214    [(); num_buckets(K) - 1]:,
1215{
1216}
1217
1218#[cfg(feature = "alloc")]
1219impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
1220where
1221    Self: private::SupportedOtherTables,
1222    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
1223    [(); 1 << K]:,
1224    [(); num_buckets(K)]:,
1225    [(); num_buckets(K) - 1]:,
1226{
    /// Creates a new [`TABLE_NUMBER`] table. There also exists [`Self::create_parallel()`], which
    /// trades CPU efficiency and memory usage for lower latency; when multiple tables are created
    /// in parallel, this method gives better overall performance.
    pub(super) fn create<const PARENT_TABLE_NUMBER: u8>(
        parent_table: Table<K, PARENT_TABLE_NUMBER>,
        cache: &TablesCache,
    ) -> (Self, PrunedTable<K, PARENT_TABLE_NUMBER>)
    where
        Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
        EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    {
        let left_targets = &*cache.left_targets;
        let mut initialized_elements = 0_usize;
        // SAFETY: Contents is `MaybeUninit`
        let mut ys = unsafe { Box::<[MaybeUninit<Y>; 1 << K]>::new_uninit().assume_init() };
        // SAFETY: Contents is `MaybeUninit`
        let mut positions =
            unsafe { Box::<[MaybeUninit<[Position; 2]>; 1 << K]>::new_uninit().assume_init() };
        // The last table doesn't have metadata
        // SAFETY: Contents is `MaybeUninit`
        let mut metadatas = unsafe {
            Box::<[MaybeUninit<Metadata<K, TABLE_NUMBER>>; 1 << K]>::new_uninit().assume_init()
        };

        for ([left_bucket, right_bucket], left_bucket_index) in
            parent_table.buckets().array_windows().zip(0..)
        {
            let mut matches = [MaybeUninit::uninit(); _];
            // SAFETY: Positions are taken from `Table::buckets()` and correspond to initialized
            // values
            let matches = unsafe {
                find_matches_in_buckets(
                    left_bucket_index,
                    left_bucket,
                    right_bucket,
                    &mut matches,
                    left_targets,
                )
            };
            // Throw away some successful matches that are not that necessary
            let matches = &matches[..matches.len().min(REDUCED_MATCHES_COUNT)];
            // SAFETY: Already initialized this many elements
            let (ys, positions, metadatas) = unsafe {
                (
                    ys.split_at_mut_unchecked(initialized_elements).1,
                    positions.split_at_mut_unchecked(initialized_elements).1,
                    metadatas.split_at_mut_unchecked(initialized_elements).1,
                )
            };

            // SAFETY: Preallocated length is an upper bound and is always sufficient
            let (ys, positions, metadatas) = unsafe {
                (
                    ys.split_at_mut_unchecked(matches.len()).0,
                    positions.split_at_mut_unchecked(matches.len()).0,
                    metadatas.split_at_mut_unchecked(matches.len()).0,
                )
            };

            // SAFETY: Matches come from the parent table and the size of `ys`, `positions`
            // and `metadatas` is the same as the number of matches
            unsafe {
                matches_to_results(&parent_table, matches, ys, positions, metadatas);
            }

            initialized_elements += matches.len();
        }

        let parent_table = parent_table.prune();

        // SAFETY: Converting a boxed array to a vector of the same size, which has the same memory
        // layout, the number of elements matches the number of elements that were initialized
        let ys = unsafe {
            let ys_len = ys.len();
            let ys = Box::into_raw(ys);
            Vec::from_raw_parts(ys.cast(), initialized_elements, ys_len)
        };

        // TODO: Try to group buckets in the process of collecting `y`s
        let buckets = group_by_buckets::<K>(&ys);

        let table = Self::Other {
            positions,
            metadatas,
            buckets,
        };

        (table, parent_table)
    }

    /// Almost the same as [`Self::create()`], but uses parallelism internally for better
    /// performance (though not efficiency of CPU and memory usage). If you create multiple tables
    /// in parallel, prefer [`Self::create()`] for better overall performance.
1320    #[cfg(feature = "parallel")]
1321    pub(super) fn create_parallel<const PARENT_TABLE_NUMBER: u8>(
1322        parent_table: Table<K, PARENT_TABLE_NUMBER>,
1323        cache: &TablesCache,
1324    ) -> (Self, PrunedTable<K, PARENT_TABLE_NUMBER>)
1325    where
1326        Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
1327        EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
1328    {
1329        // SAFETY: Contents is `MaybeUninit`
1330        let ys = unsafe {
1331            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
1332        };
1333        // SAFETY: Contents is `MaybeUninit`
1334        let positions = unsafe {
1335            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
1336        };
1337        // SAFETY: Contents is `MaybeUninit`
1338        let metadatas = unsafe {
1339            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
1340        };
1341        let global_results_counts =
1342            array::from_fn::<_, { num_buckets(K) - 1 }, _>(|_| SyncUnsafeCell::new(0u16));
1343
1344        let left_targets = &*cache.left_targets;
1345
1346        let buckets = parent_table.buckets();
1347        // Iterate over buckets in batches, such that a cache line worth of bytes is taken from
1348        // `global_results_counts` each time to avoid unnecessary false sharing
1349        let bucket_batch_size = CACHE_LINE_SIZE / size_of::<u16>();
        let bucket_batch_index = AtomicUsize::new(0);

        rayon::broadcast(|_ctx| {
            loop {
                let bucket_batch_index = bucket_batch_index.fetch_add(1, Ordering::Relaxed);

                let buckets_batch = buckets
                    .array_windows::<2>()
                    .enumerate()
                    .skip(bucket_batch_index * bucket_batch_size)
                    .take(bucket_batch_size);

                if buckets_batch.is_empty() {
                    break;
                }

                for (left_bucket_index, [left_bucket, right_bucket]) in buckets_batch {
                    let mut matches = [MaybeUninit::uninit(); _];
                    // SAFETY: Positions are taken from `Table::buckets()` and correspond to
                    // initialized values
                    let matches = unsafe {
                        find_matches_in_buckets(
                            left_bucket_index as u32,
                            left_bucket,
                            right_bucket,
                            &mut matches,
                            left_targets,
                        )
                    };
                    // Throw away excess matches; `REDUCED_MATCHES_COUNT` should be sufficient
                    // with high probability
                    let matches = &matches[..matches.len().min(REDUCED_MATCHES_COUNT)];

                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
                    // at this time, and it is guaranteed to be in range
                    let ys = unsafe { &mut *ys.get_unchecked(left_bucket_index).get() };
                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
                    // at this time, and it is guaranteed to be in range
                    let positions =
                        unsafe { &mut *positions.get_unchecked(left_bucket_index).get() };
                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
                    // at this time, and it is guaranteed to be in range
                    let metadatas =
                        unsafe { &mut *metadatas.get_unchecked(left_bucket_index).get() };
                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
                    // at this time, and it is guaranteed to be in range
                    let count = unsafe {
                        &mut *global_results_counts.get_unchecked(left_bucket_index).get()
                    };

                    // SAFETY: Matches come from the parent table, and the sizes of `ys`,
                    // `positions` and `metadatas` are larger than or equal to the number of
                    // matches
                    unsafe {
                        matches_to_results::<_, TABLE_NUMBER, _>(
                            &parent_table,
                            matches,
                            ys,
                            positions,
                            metadatas,
                        )
                    };
                    *count = matches.len() as u16;
                }
            }
        });

        let parent_table = parent_table.prune();

        let ys = strip_sync_unsafe_cell(ys);
        let positions = strip_sync_unsafe_cell(positions);
        let metadatas = strip_sync_unsafe_cell(metadatas);
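        // NOTE: `rayon::broadcast()` above has returned, so all workers are done and this thread
        // has exclusive access again, making it safe to strip the `SyncUnsafeCell` wrappers and
        // work with plain `MaybeUninit` arrays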

        // TODO: Try to group buckets in the process of collecting `y`s
        // SAFETY: `global_results_counts` corresponds to the number of initialized `ys`
        let buckets = unsafe {
            group_by_buckets_from_buckets::<K, _>(
                ys.iter().zip(
                    global_results_counts
                        .into_iter()
                        .map(|count| usize::from(count.into_inner())),
                ),
            )
        };

        let table = Self::OtherBuckets {
            positions,
            metadatas,
            buckets,
        };

        (table, parent_table)
    }

    /// Get `[left_position, right_position]` of the previous table for the specified position in
    /// the current table.
    ///
    /// # Safety
    /// `position` must come from [`Self::buckets()`], [`Self::position()`] or
    /// [`PrunedTable::position()`] and not be a sentinel value.
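    ///
    /// A hypothetical sketch of walking back-pointers towards the first table (illustrative
    /// only):
    ///
    /// ```ignore
    /// // SAFETY: `pos` was returned by `table_n.buckets()` and is not a sentinel value
    /// let [left, right] = unsafe { table_n.position(pos) };
    /// // Repeating the lookup with `left`/`right` against table `n - 1` and so on eventually
    /// // reaches the first table, where positions correspond to `x` values directly
    /// ```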
    #[inline(always)]
    pub(super) unsafe fn position(&self, position: Position) -> [Position; 2] {
        match self {
            Self::First { .. } => {
                unreachable!("Not the first table");
            }
            Self::Other { positions, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe { positions.get_unchecked(usize::from(position)).assume_init() }
            }
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { positions, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe {
                    positions
                        .as_flattened()
                        .get_unchecked(usize::from(position))
                        .assume_init()
                }
            }
        }
    }
}

#[cfg(feature = "alloc")]
impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
where
    Self: private::NotLastTable,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    /// Get metadata for the specified position.
    ///
    /// Not available for the last table (table number 7 stores no metadata), which is enforced by
    /// the [`private::NotLastTable`] bound on this impl.
    ///
    /// # Safety
    /// `position` must come from [`Self::buckets()`] and not be a sentinel value.
    #[inline(always)]
    unsafe fn metadata(&self, position: Position) -> Metadata<K, TABLE_NUMBER> {
        match self {
            Self::First { .. } => {
                // In the first table, `x` matches the position
                Metadata::from(X::from(u32::from(position)))
            }
            Self::Other { metadatas, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe { metadatas.get_unchecked(usize::from(position)).assume_init() }
            }
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { metadatas, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe {
                    metadatas
                        .as_flattened()
                        .get_unchecked(usize::from(position))
                        .assume_init()
                }
            }
        }
    }
}

#[cfg(feature = "alloc")]
impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
where
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
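    /// Prune the table down to just the back-pointers (`positions`), which is all a parent table
    /// needs to retain once the next table has been created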
    #[inline(always)]
    fn prune(self) -> PrunedTable<K, TABLE_NUMBER> {
        match self {
            Self::First { .. } => PrunedTable::First,
            Self::Other { positions, .. } => PrunedTable::Other { positions },
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { positions, .. } => PrunedTable::OtherBuckets { positions },
        }
    }

    /// Positions and `y`s grouped by the bucket they belong to
    #[inline(always)]
    pub(super) fn buckets(&self) -> &[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)] {
        match self {
            Self::First { buckets, .. } => buckets,
            Self::Other { buckets, .. } => buckets,
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { buckets, .. } => buckets,
        }
    }
}