Skip to main content

subspace_proof_of_space/chiapos/
table.rs

1//! `chiapos` table implementation.
2
3#[cfg(any(feature = "alloc", test))]
4mod rmap;
5#[cfg(test)]
6mod tests;
7pub(super) mod types;
8
9use crate::chiapos::Seed;
10use crate::chiapos::constants::{PARAM_B, PARAM_BC, PARAM_C, PARAM_EXT, PARAM_M};
11#[cfg(feature = "alloc")]
12use crate::chiapos::table::rmap::Rmap;
13use crate::chiapos::table::types::{Metadata, X, Y};
14#[cfg(feature = "alloc")]
15use crate::chiapos::table::types::{Position, R};
16use crate::chiapos::utils::EvaluatableUsize;
17use ab_chacha8::{ChaCha8Block, ChaCha8State};
18#[cfg(feature = "alloc")]
19use alloc::boxed::Box;
20#[cfg(feature = "alloc")]
21use alloc::vec;
22#[cfg(feature = "alloc")]
23use alloc::vec::Vec;
24#[cfg(feature = "alloc")]
25use chacha20::cipher::{Iv, KeyIvInit, StreamCipher};
26#[cfg(feature = "alloc")]
27use chacha20::{ChaCha8, Key};
28use core::array;
29#[cfg(feature = "parallel")]
30use core::cell::SyncUnsafeCell;
31#[cfg(feature = "alloc")]
32use core::mem;
33#[cfg(feature = "alloc")]
34use core::mem::MaybeUninit;
35#[cfg(any(feature = "alloc", test))]
36use core::simd::prelude::*;
37#[cfg(feature = "parallel")]
38use core::sync::atomic::{AtomicUsize, Ordering};
39#[cfg(feature = "alloc")]
40use derive_more::Deref;
41#[cfg(any(feature = "alloc", test))]
42use seq_macro::seq;
43// TODO: Switch to `rclite` once https://github.com/fereidani/rclite/issues/11 is resolved
44#[cfg(feature = "alloc")]
45use alloc::sync::Arc;
46use subspace_core_primitives::hashes::blake3_hash;
47
48pub(super) const COMPUTE_F1_SIMD_FACTOR: usize = 8;
49#[cfg(any(feature = "alloc", test))]
50const COMPUTE_FN_SIMD_FACTOR: usize = 16;
51#[allow(dead_code, reason = "unused when crate is compiled separately")]
52const MAX_BUCKET_SIZE: usize = 512;
53#[cfg(feature = "alloc")]
54const BUCKET_SIZE_UPPER_BOUND_SECURITY_BITS: u8 = 128;
55/// Bucket size used for fixed-size bucketed arrays.
56///
57/// Must be `MAX_BUCKET_SIZE` to avoid silently dropping entries when a bucket's Y values exceed
58/// the limit. With `PARAM_BC / 2^PARAM_EXT ≈ 236` expected entries per bucket and std dev ≈ 15.4,
59/// the original value of 272 (~2.3 sigma) caused overflow in production (K=20), dropping hundreds
60/// of proofs and causing "Missing PoS proof" farming errors.
61#[allow(dead_code, reason = "unused when crate is compiled separately")]
62pub(crate) const REDUCED_BUCKET_SIZE: usize = MAX_BUCKET_SIZE;
63/// Matches count limit per bucket pair.
64///
65/// Must be `MAX_BUCKET_SIZE` to avoid silently dropping matches. The original value of 288 could
66/// cause early termination in `find_matches_in_buckets`, losing valid proof paths.
67#[allow(dead_code, reason = "unused when crate is compiled separately")]
68const REDUCED_MATCHES_COUNT: usize = MAX_BUCKET_SIZE;
69#[cfg(feature = "parallel")]
70const CACHE_LINE_SIZE: usize = 64;
71
72const _: () = {
73    debug_assert!(REDUCED_BUCKET_SIZE <= MAX_BUCKET_SIZE);
74    debug_assert!(REDUCED_MATCHES_COUNT <= MAX_BUCKET_SIZE);
75};
76
77/// Compute the size of `y` in bits
78pub(super) const fn y_size_bits(k: u8) -> usize {
79    k as usize + PARAM_EXT as usize
80}
81
82/// Metadata size in bytes
83pub const fn metadata_size_bytes(k: u8, table_number: u8) -> usize {
84    metadata_size_bits(k, table_number).div_ceil(u8::BITS as usize)
85}
86
87/// Metadata size in bits
88pub(super) const fn metadata_size_bits(k: u8, table_number: u8) -> usize {
89    k as usize
90        * match table_number {
91            1 => 1,
92            2 => 2,
93            3 | 4 => 4,
94            5 => 3,
95            6 => 2,
96            7 => 0,
97            _ => unreachable!(),
98        }
99}
100
101/// Number of buckets for a given `k`
102pub const fn num_buckets(k: u8) -> usize {
103    2_usize
104        .pow(y_size_bits(k) as u32)
105        .div_ceil(PARAM_BC as usize)
106}
107
108/// Upper bound on entries in a non-first table: `num_buckets(K) - 1` bucket pairs, each yielding at
109/// most `REDUCED_MATCHES_COUNT` matches. The actual count is data-dependent and can exceed `2^K`.
110#[cfg(feature = "alloc")]
111const fn max_table_entries(k: u8) -> usize {
112    (num_buckets(k) - 1) * REDUCED_MATCHES_COUNT
113}
114
115#[cfg(feature = "parallel")]
116#[inline(always)]
117fn strip_sync_unsafe_cell<const N: usize, T>(value: Box<[SyncUnsafeCell<T>; N]>) -> Box<[T; N]> {
118    // SAFETY: `SyncUnsafeCell` has the same layout as `T`
119    unsafe { Box::from_raw(Box::into_raw(value).cast()) }
120}
121
122/// ChaCha8 [`Vec`] sufficient for the whole first table for [`K`].
123/// Prefer [`partial_y`] if you need partial y just for a single `x`.
124#[cfg(feature = "alloc")]
125fn partial_ys<const K: u8>(seed: Seed) -> Vec<u8> {
126    let output_len_bits = usize::from(K) * (1 << K);
127    let mut output = vec![0; output_len_bits.div_ceil(u8::BITS as usize)];
128
129    let key = Key::from(seed);
130    let iv = Iv::<ChaCha8>::default();
131
132    let mut cipher = ChaCha8::new(&key, &iv);
133
134    cipher.apply_keystream(&mut output);
135
136    output
137}
138
139/// Calculate a probabilistic upper bound on the Chia bucket size for a given `k` and
140/// `security_bits` (security level).
141///
142/// This is based on a Chernoff bound for the Poisson distribution with mean
143/// `lambda = PARAM_BC / 2^PARAM_EXT`, ensuring the probability that any bucket exceeds the bound is
144/// less than `2^{-security_bits}`.
145/// The bound is lambda + ceil(sqrt(3 * lambda * (k + security_bits) * ln(2))).
146#[cfg(feature = "alloc")]
147const fn bucket_size_upper_bound(k: u8, security_bits: u8) -> usize {
148    // Lambda is the expected number of entries in a bucket, approximated as
149    // `PARAM_BC / 2^PARAM_EXT`. It is independent of `k`.
150    const LAMBDA: u64 = PARAM_BC as u64 / 2u64.pow(PARAM_EXT as u32);
151    // Approximation of ln(2) as a fraction: ln(2) ≈ LN2_NUM / LN2_DEN.
152    // This allows integer-only computation of the square root term involving ln(2).
153    const LN2_NUM: u128 = 693147;
154    const LN2_DEN: u128 = 1000000;
155
156    // `k + security_bits` for the union bound over ~2^k intervals
157    let ks = k as u128 + security_bits as u128;
158    // Compute numerator for the expression under the square root:
159    // `3 * lambda * (k + security_bits) * LN2_NUM`
160    let num = 3u128 * LAMBDA as u128 * ks * LN2_NUM;
161    // Denominator for ln(2): `LN2_DEN`
162    let den = LN2_DEN;
163
164    let ceil_div: u128 = num.div_ceil(den);
165
166    // Binary search to find the smallest `x` such that `x * x * den >= num`,
167    // which computes `ceil(sqrt(num / den))` without floating-point.
168    // We use a custom binary search over `u64` range because binary search in the standard library
169    // operates on sorted slices, not directly on integer ranges for solving inequalities like this.
170    let mut low = 0u64;
171    let mut high = u64::MAX;
172    while low < high {
173        let mid = low + (high - low) / 2;
174        let left = (mid as u128) * (mid as u128);
175        if left >= ceil_div {
176            high = mid;
177        } else {
178            low = mid + 1;
179        }
180    }
181    let add_term = low;
182
183    (LAMBDA + add_term) as usize
184}
185
186#[cfg(feature = "alloc")]
187fn group_by_buckets<const K: u8>(
188    ys: &[Y],
189) -> Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>
190where
191    [(); num_buckets(K)]:,
192{
193    let mut bucket_offsets = [0_u16; num_buckets(K)];
194    // SAFETY: Contents is `MaybeUninit`
195    let mut buckets = unsafe {
196        Box::<[[MaybeUninit<(Position, Y)>; REDUCED_BUCKET_SIZE]; num_buckets(K)]>::new_uninit()
197            .assume_init()
198    };
199
200    for (&y, position) in ys.iter().zip(Position::ZERO..) {
201        let bucket_index = (u32::from(y) / u32::from(PARAM_BC)) as usize;
202
203        // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
204        let bucket_offset = unsafe { bucket_offsets.get_unchecked_mut(bucket_index) };
205        // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
206        let bucket = unsafe { buckets.get_unchecked_mut(bucket_index) };
207
208        if *bucket_offset < REDUCED_BUCKET_SIZE as u16 {
209            bucket[*bucket_offset as usize].write((position, y));
210            *bucket_offset += 1;
211        }
212    }
213
214    for (bucket, initialized) in buckets.iter_mut().zip(bucket_offsets) {
215        bucket[usize::from(initialized)..].write_filled((Position::SENTINEL, Y::SENTINEL));
216    }
217
218    // SAFETY: All entries are initialized
219    unsafe { Box::from_raw(Box::into_raw(buckets).cast()) }
220}
221
222/// Similar to [`group_by_buckets()`], but processes buckets instead of a flat list of `y`s.
223///
224/// # Safety
225/// Iterator item is a list of potentially uninitialized `y`s and a number of initialized `y`s. The
226/// number of initialized `y`s must be correct.
227#[cfg(feature = "parallel")]
228unsafe fn group_by_buckets_from_buckets<'a, const K: u8, I>(
229    iter: I,
230) -> Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>
231where
232    I: Iterator<Item = (&'a [MaybeUninit<Y>; REDUCED_MATCHES_COUNT], usize)> + 'a,
233    [(); num_buckets(K)]:,
234{
235    let mut bucket_offsets = [0_u16; num_buckets(K)];
236    // SAFETY: Contents is `MaybeUninit`
237    let mut buckets = unsafe {
238        Box::<[[MaybeUninit<(Position, Y)>; REDUCED_BUCKET_SIZE]; num_buckets(K)]>::new_uninit()
239            .assume_init()
240    };
241
242    for ((ys, count), batch_start) in iter.zip((Position::ZERO..).step_by(REDUCED_MATCHES_COUNT)) {
243        // SAFETY: Function contract guarantees that `y`s are initialized
244        let ys = unsafe { ys[..count].assume_init_ref() };
245        for (&y, position) in ys.iter().zip(batch_start..) {
246            let bucket_index = (u32::from(y) / u32::from(PARAM_BC)) as usize;
247
248            // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
249            let bucket_offset = unsafe { bucket_offsets.get_unchecked_mut(bucket_index) };
250            // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
251            let bucket = unsafe { buckets.get_unchecked_mut(bucket_index) };
252
253            if *bucket_offset < REDUCED_BUCKET_SIZE as u16 {
254                bucket[*bucket_offset as usize].write((position, y));
255                *bucket_offset += 1;
256            }
257        }
258    }
259
260    for (bucket, initialized) in buckets.iter_mut().zip(bucket_offsets) {
261        bucket[usize::from(initialized)..].write_filled((Position::SENTINEL, Y::SENTINEL));
262    }
263
264    // SAFETY: All entries are initialized
265    unsafe { Box::from_raw(Box::into_raw(buckets).cast()) }
266}
267
268/// Sort entries within each bucket by Y value to ensure deterministic proof ordering.
269/// This matches the old code's behavior where entries were globally Y-sorted before bucketing.
270#[cfg(feature = "alloc")]
271fn sort_buckets<const K: u8>(buckets: &mut [[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)])
272where
273    [(); num_buckets(K)]:,
274{
275    for bucket in buckets.iter_mut() {
276        let len = bucket
277            .iter()
278            .take_while(|&&(pos, _)| pos != Position::SENTINEL)
279            .count();
280        bucket[..len].sort_unstable_by_key(|&(pos, y)| (u32::from(y), u32::from(pos)));
281    }
282}
283
284#[cfg(feature = "alloc")]
285#[derive(Debug, Copy, Clone, Deref)]
286#[repr(align(64))]
287struct CacheLineAligned<T>(T);
288
289/// Mapping from `parity` to `r` to `m`
290#[cfg(feature = "alloc")]
291type LeftTargets = [[CacheLineAligned<[R; PARAM_M as usize]>; PARAM_BC as usize]; 2];
292
293#[cfg(feature = "alloc")]
294fn calculate_left_targets() -> Arc<LeftTargets> {
295    let mut left_targets = Arc::<LeftTargets>::new_uninit();
296    // SAFETY: Same layout and uninitialized in both cases
297    let left_targets_slice = unsafe {
298        mem::transmute::<
299            &mut MaybeUninit<[[CacheLineAligned<[R; PARAM_M as usize]>; PARAM_BC as usize]; 2]>,
300            &mut [[MaybeUninit<CacheLineAligned<[R; PARAM_M as usize]>>; PARAM_BC as usize]; 2],
301        >(Arc::get_mut_unchecked(&mut left_targets))
302    };
303
304    for parity in 0..=1 {
305        for r in 0..PARAM_BC {
306            let c = r / PARAM_C;
307
308            let mut arr = array::from_fn(|m| {
309                let m = m as u16;
310                R::from(
311                    ((c + m) % PARAM_B) * PARAM_C
312                        + (((2 * m + parity) * (2 * m + parity) + r) % PARAM_C),
313                )
314            });
315            arr.sort_unstable();
316            left_targets_slice[parity as usize][r as usize].write(CacheLineAligned(arr));
317        }
318    }
319
320    // SAFETY: Initialized all entries
321    unsafe { left_targets.assume_init() }
322}
323
324fn calculate_left_target_on_demand(parity: u32, r: u32, m: u32) -> u32 {
325    let param_b = u32::from(PARAM_B);
326    let param_c = u32::from(PARAM_C);
327
328    ((r / param_c + m) % param_b) * param_c + (((2 * m + parity) * (2 * m + parity) + r) % param_c)
329}
330
331/// Caches that can be used to optimize the creation of multiple [`Tables`](super::Tables).
332#[cfg(feature = "alloc")]
333#[derive(Debug, Clone)]
334pub struct TablesCache {
335    left_targets: Arc<LeftTargets>,
336}
337
338#[cfg(feature = "alloc")]
339impl Default for TablesCache {
340    /// Create a new instance
341    fn default() -> Self {
342        Self {
343            left_targets: calculate_left_targets(),
344        }
345    }
346}
347
348#[cfg(feature = "alloc")]
349#[derive(Debug, Copy, Clone)]
350struct Match {
351    left_position: Position,
352    left_y: Y,
353    right_position: Position,
354}
355
356/// `partial_y_offset` is in bits within `partial_y`
357pub(super) fn compute_f1<const K: u8>(x: X, seed: &Seed) -> Y {
358    let skip_bits = u32::from(K) * u32::from(x);
359    let skip_u32s = skip_bits / u32::BITS;
360    let partial_y_offset = skip_bits % u32::BITS;
361
362    const U32S_PER_BLOCK: usize = size_of::<ChaCha8Block>() / size_of::<u32>();
363
364    let initial_state = ChaCha8State::init(seed, &[0; _]);
365    let first_block_counter = skip_u32s / U32S_PER_BLOCK as u32;
366    let u32_in_first_block = skip_u32s as usize % U32S_PER_BLOCK;
367
368    let first_block = initial_state.compute_block(first_block_counter);
369    let hi = first_block[u32_in_first_block].to_be();
370
371    // TODO: Is SIMD version of `compute_block()` that produces two blocks at once possible?
372    let lo = if u32_in_first_block + 1 == U32S_PER_BLOCK {
373        // Spilled over into the second block
374        let second_block = initial_state.compute_block(first_block_counter + 1);
375        second_block[0].to_be()
376    } else {
377        first_block[u32_in_first_block + 1].to_be()
378    };
379
380    let partial_y = (u64::from(hi) << u32::BITS) | u64::from(lo);
381
382    let pre_y = partial_y >> (u64::BITS - u32::from(K + PARAM_EXT) - partial_y_offset);
383    let pre_y = pre_y as u32;
384    // Mask for clearing the rest of bits of `pre_y`.
385    let pre_y_mask = (u32::MAX << PARAM_EXT) & (u32::MAX >> (u32::BITS - u32::from(K + PARAM_EXT)));
386
387    // Extract `PARAM_EXT` most significant bits from `x` and store in the final offset of
388    // eventual `y` with the rest of bits being zero (`x` is `0..2^K`)
389    let pre_ext = u32::from(x) >> (K - PARAM_EXT);
390
391    // Combine all of the bits together:
392    // [padding zero bits][`K` bits rom `partial_y`][`PARAM_EXT` bits from `x`]
393    Y::from((pre_y & pre_y_mask) | pre_ext)
394}
395
396#[cfg(any(feature = "alloc", test))]
397pub(super) fn compute_f1_simd<const K: u8>(
398    xs: Simd<u32, COMPUTE_F1_SIMD_FACTOR>,
399    partial_ys: &[u8; K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize],
400) -> [Y; COMPUTE_F1_SIMD_FACTOR] {
401    // Each element contains `K` desired bits of `partial_ys` in the final offset of eventual `ys`
402    // with the rest of bits being in undefined state
403    let pre_ys_bytes = array::from_fn(|i| {
404        let partial_y_offset = i * usize::from(K);
405        let partial_y_length =
406            (partial_y_offset % u8::BITS as usize + usize::from(K)).div_ceil(u8::BITS as usize);
407        let mut pre_y_bytes = 0u64.to_be_bytes();
408        pre_y_bytes[..partial_y_length].copy_from_slice(
409            &partial_ys[partial_y_offset / u8::BITS as usize..][..partial_y_length],
410        );
411
412        u64::from_be_bytes(pre_y_bytes)
413    });
414    let pre_ys_right_offset = array::from_fn(|i| {
415        let partial_y_offset = i as u32 * u32::from(K);
416        u64::from(u64::BITS - u32::from(K + PARAM_EXT) - partial_y_offset % u8::BITS)
417    });
418    let pre_ys = Simd::from_array(pre_ys_bytes) >> Simd::from_array(pre_ys_right_offset);
419
420    // Mask for clearing the rest of bits of `pre_ys`.
421    let pre_ys_mask = Simd::splat(
422        (u32::MAX << usize::from(PARAM_EXT))
423            & (u32::MAX >> (u32::BITS as usize - usize::from(K + PARAM_EXT))),
424    );
425
426    // Extract `PARAM_EXT` most significant bits from `xs` and store in the final offset of
427    // eventual `ys` with the rest of bits being in undefined state.
428    let pre_exts = xs >> Simd::splat(u32::from(K - PARAM_EXT));
429
430    // Combine all of the bits together:
431    // [padding zero bits][`K` bits rom `partial_y`][`PARAM_EXT` bits from `x`]
432    let ys = (pre_ys.cast() & pre_ys_mask) | pre_exts;
433
434    Y::array_from_repr(ys.to_array())
435}
436
437/// For verification use [`has_match`] instead.
438///
439/// # Safety
440/// Left and right bucket positions must correspond to the parent table.
441// TODO: Try to reduce the `matches` size further by processing `left_bucket` in chunks (like halves
442//  for example)
443#[cfg(feature = "alloc")]
444unsafe fn find_matches_in_buckets<'a>(
445    left_bucket_index: u32,
446    left_bucket: &[(Position, Y); REDUCED_BUCKET_SIZE],
447    right_bucket: &[(Position, Y); REDUCED_BUCKET_SIZE],
448    // `PARAM_M as usize * 2` corresponds to the upper bound number of matches a single `y` in the
449    // left bucket might have here
450    matches: &'a mut [MaybeUninit<Match>; REDUCED_MATCHES_COUNT + PARAM_M as usize * 2],
451    left_targets: &LeftTargets,
452) -> &'a [Match] {
453    let left_base = left_bucket_index * u32::from(PARAM_BC);
454    let right_base = left_base + u32::from(PARAM_BC);
455
456    let mut rmap = Rmap::new();
457    for &(right_position, y) in right_bucket {
458        if right_position == Position::SENTINEL {
459            break;
460        }
461        let r = R::from((u32::from(y) - right_base) as u16);
462        // SAFETY: `r` is within `0..PARAM_BC` range by definition, the right bucket is limited to
463        // `REDUCED_BUCKETS_SIZE`
464        unsafe {
465            rmap.add(r, right_position);
466        }
467    }
468
469    let parity = left_base % 2;
470    let left_targets_parity = &left_targets[parity as usize];
471    let mut next_match_index = 0;
472
473    // TODO: Simd read for left bucket? It might be more efficient in terms of memory access to
474    //  process chunks of the left bucket against one right value for each at a time
475    for &(left_position, y) in left_bucket {
476        // `next_match_index >= REDUCED_MATCHES_COUNT` is crucial to make sure
477        if left_position == Position::SENTINEL || next_match_index >= REDUCED_MATCHES_COUNT {
478            // Sentinel values are padded to the end of the bucket
479            break;
480        }
481
482        let r = R::from((u32::from(y) - left_base) as u16);
483        // SAFETY: `r` is within a bucket and exists by definition
484        let left_targets_r = unsafe { left_targets_parity.get_unchecked(usize::from(r)) };
485
486        for &r_target in left_targets_r.iter() {
487            // SAFETY: Targets are always limited to `PARAM_BC`
488            let right_positions = unsafe { rmap.get(r_target) };
489
490            for &right_position in right_positions {
491                if next_match_index >= matches.len() {
492                    break;
493                }
494                // SAFETY: Bounds checked above
495                unsafe { matches.get_unchecked_mut(next_match_index) }.write(Match {
496                    left_position,
497                    left_y: y,
498                    right_position,
499                });
500                next_match_index += 1;
501            }
502        }
503    }
504
505    // SAFETY: Initialized this many matches
506    unsafe { matches[..next_match_index].assume_init_ref() }
507}
508
509/// Simplified version of [`find_matches_in_buckets`] for verification purposes.
510pub(super) fn has_match(left_y: Y, right_y: Y) -> bool {
511    let right_r = u32::from(right_y) % u32::from(PARAM_BC);
512    let parity = (u32::from(left_y) / u32::from(PARAM_BC)) % 2;
513    let left_r = u32::from(left_y) % u32::from(PARAM_BC);
514
515    let r_targets = array::from_fn::<_, { PARAM_M as usize }, _>(|i| {
516        calculate_left_target_on_demand(parity, left_r, i as u32)
517    });
518
519    r_targets.contains(&right_r)
520}
521
522#[inline(always)]
523pub(super) fn compute_fn<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
524    y: Y,
525    left_metadata: Metadata<K, PARENT_TABLE_NUMBER>,
526    right_metadata: Metadata<K, PARENT_TABLE_NUMBER>,
527) -> (Y, Metadata<K, TABLE_NUMBER>)
528where
529    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
530    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
531{
532    let left_metadata = u128::from(left_metadata);
533    let right_metadata = u128::from(right_metadata);
534
535    let parent_metadata_bits = metadata_size_bits(K, PARENT_TABLE_NUMBER);
536
537    // Part of the `right_bits` at the final offset of eventual `input_a`
538    let y_and_left_bits = y_size_bits(K) + parent_metadata_bits;
539    let right_bits_start_offset = u128::BITS as usize - parent_metadata_bits;
540
541    // Take only bytes where bits were set
542    let num_bytes_with_data =
543        (y_size_bits(K) + parent_metadata_bits * 2).div_ceil(u8::BITS as usize);
544
545    // Only supports `K` from 15 to 25 (otherwise math will not be correct when concatenating y,
546    // left metadata and right metadata)
547    let hash = {
548        // Collect `K` most significant bits of `y` at the final offset of eventual `input_a`
549        let y_bits = u128::from(y) << (u128::BITS as usize - y_size_bits(K));
550
551        // Move bits of `left_metadata` at the final offset of eventual `input_a`
552        let left_metadata_bits =
553            left_metadata << (u128::BITS as usize - parent_metadata_bits - y_size_bits(K));
554
555        // If `right_metadata` bits start to the left of the desired position in `input_a` move
556        // bits right, else move left
557        if right_bits_start_offset < y_and_left_bits {
558            let right_bits_pushed_into_input_b = y_and_left_bits - right_bits_start_offset;
559            // Collect bits of `right_metadata` that will fit into `input_a` at the final offset in
560            // eventual `input_a`
561            let right_bits_a = right_metadata >> right_bits_pushed_into_input_b;
562            let input_a = y_bits | left_metadata_bits | right_bits_a;
563            // Collect bits of `right_metadata` that will spill over into `input_b`
564            let input_b = right_metadata << (u128::BITS as usize - right_bits_pushed_into_input_b);
565
566            let input = [input_a.to_be_bytes(), input_b.to_be_bytes()];
567            let input_len =
568                size_of::<u128>() + right_bits_pushed_into_input_b.div_ceil(u8::BITS as usize);
569            blake3_hash(&input.as_flattened()[..input_len])
570        } else {
571            let right_bits_a = right_metadata << (right_bits_start_offset - y_and_left_bits);
572            let input_a = y_bits | left_metadata_bits | right_bits_a;
573
574            blake3_hash(&input_a.to_be_bytes()[..num_bytes_with_data])
575        }
576    };
577
578    let y_output = Y::from(
579        u32::from_be_bytes([hash[0], hash[1], hash[2], hash[3]])
580            >> (u32::BITS as usize - y_size_bits(K)),
581    );
582
583    let metadata_size_bits = metadata_size_bits(K, TABLE_NUMBER);
584
585    let metadata = if TABLE_NUMBER < 4 {
586        (left_metadata << parent_metadata_bits) | right_metadata
587    } else if metadata_size_bits > 0 {
588        // For K up to 25 it is guaranteed that metadata + bit offset will always fit into u128.
589        // We collect the bytes necessary, potentially with extra bits at the start and end of the
590        // bytes that will be taken care of later.
591        let metadata = u128::from_be_bytes(
592            hash[y_size_bits(K) / u8::BITS as usize..][..size_of::<u128>()]
593                .try_into()
594                .expect("Always enough bits for any K; qed"),
595        );
596        // Remove extra bits at the beginning
597        let metadata = metadata << (y_size_bits(K) % u8::BITS as usize);
598        // Move bits into the correct location
599        metadata >> (u128::BITS as usize - metadata_size_bits)
600    } else {
601        0
602    };
603
604    (y_output, Metadata::from(metadata))
605}
606
607// TODO: This is actually using only pipelining rather than real SIMD (at least explicitly) due to:
608//  * https://github.com/rust-lang/portable-simd/issues/108
609//  * https://github.com/BLAKE3-team/BLAKE3/issues/478#issuecomment-3200106103
610#[cfg(any(feature = "alloc", test))]
611fn compute_fn_simd<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
612    left_ys: [Y; COMPUTE_FN_SIMD_FACTOR],
613    left_metadatas: [Metadata<K, PARENT_TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
614    right_metadatas: [Metadata<K, PARENT_TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
615) -> (
616    [Y; COMPUTE_FN_SIMD_FACTOR],
617    [Metadata<K, TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
618)
619where
620    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
621    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
622{
623    let parent_metadata_bits = metadata_size_bits(K, PARENT_TABLE_NUMBER);
624    let metadata_size_bits = metadata_size_bits(K, TABLE_NUMBER);
625
626    // TODO: `u128` is not supported as SIMD element yet, see
627    //  https://github.com/rust-lang/portable-simd/issues/108
628    let left_metadatas: [u128; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
629        [
630        #(
631            u128::from(left_metadatas[N]),
632        )*
633        ]
634    });
635    let right_metadatas: [u128; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
636        [
637        #(
638            u128::from(right_metadatas[N]),
639        )*
640        ]
641    });
642
643    // Part of the `right_bits` at the final offset of eventual `input_a`
644    let y_and_left_bits = y_size_bits(K) + parent_metadata_bits;
645    let right_bits_start_offset = u128::BITS as usize - parent_metadata_bits;
646
647    // Take only bytes where bits were set
648    let num_bytes_with_data =
649        (y_size_bits(K) + parent_metadata_bits * 2).div_ceil(u8::BITS as usize);
650
651    // Only supports `K` from 15 to 25 (otherwise math will not be correct when concatenating y,
652    // left metadata and right metadata)
653    // TODO: SIMD hashing once this is possible:
654    //  https://github.com/BLAKE3-team/BLAKE3/issues/478#issuecomment-3200106103
655    let hashes: [_; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
656        [
657        #(
658        {
659            let y = left_ys[N];
660            let left_metadata = left_metadatas[N];
661            let right_metadata = right_metadatas[N];
662
663            // Collect `K` most significant bits of `y` at the final offset of eventual
664            // `input_a`
665            let y_bits = u128::from(y) << (u128::BITS as usize - y_size_bits(K));
666
667            // Move bits of `left_metadata` at the final offset of eventual `input_a`
668            let left_metadata_bits =
669                left_metadata << (u128::BITS as usize - parent_metadata_bits - y_size_bits(K));
670
671            // If `right_metadata` bits start to the left of the desired position in `input_a` move
672            // bits right, else move left
673            if right_bits_start_offset < y_and_left_bits {
674                let right_bits_pushed_into_input_b = y_and_left_bits - right_bits_start_offset;
675                // Collect bits of `right_metadata` that will fit into `input_a` at the final offset
676                // in eventual `input_a`
677                let right_bits_a = right_metadata >> right_bits_pushed_into_input_b;
678                let input_a = y_bits | left_metadata_bits | right_bits_a;
679                // Collect bits of `right_metadata` that will spill over into `input_b`
680                let input_b = right_metadata << (u128::BITS as usize - right_bits_pushed_into_input_b);
681
682                let input = [input_a.to_be_bytes(), input_b.to_be_bytes()];
683                let input_len =
684                    size_of::<u128>() + right_bits_pushed_into_input_b.div_ceil(u8::BITS as usize);
685                blake3_hash(&input.as_flattened()[..input_len])
686            } else {
687                let right_bits_a = right_metadata << (right_bits_start_offset - y_and_left_bits);
688                let input_a = y_bits | left_metadata_bits | right_bits_a;
689
690                blake3_hash(&input_a.to_be_bytes()[..num_bytes_with_data])
691            }
692        },
693        )*
694        ]
695    });
696
697    let y_outputs = Simd::from_array(
698        hashes.map(|hash| u32::from_be_bytes([hash[0], hash[1], hash[2], hash[3]])),
699    ) >> (u32::BITS - y_size_bits(K) as u32);
700    let y_outputs = Y::array_from_repr(y_outputs.to_array());
701
702    let metadatas = if TABLE_NUMBER < 4 {
703        seq!(N in 0..16 {
704            [
705            #(
706                Metadata::from((left_metadatas[N] << parent_metadata_bits) | right_metadatas[N]),
707            )*
708            ]
709        })
710    } else if metadata_size_bits > 0 {
711        // For K up to 25 it is guaranteed that metadata + bit offset will always fit into u128.
712        // We collect the bytes necessary, potentially with extra bits at the start and end of the
713        // bytes that will be taken care of later.
714        seq!(N in 0..16 {
715            [
716            #(
717            {
718                let metadata = u128::from_be_bytes(
719                    hashes[N][y_size_bits(K) / u8::BITS as usize..][..size_of::<u128>()]
720                        .try_into()
721                        .expect("Always enough bits for any K; qed"),
722                );
723                // Remove extra bits at the beginning
724                let metadata = metadata << (y_size_bits(K) % u8::BITS as usize);
725                // Move bits into the correct location
726                Metadata::from(metadata >> (u128::BITS as usize - metadata_size_bits))
727            },
728            )*
729            ]
730        })
731    } else {
732        [Metadata::default(); _]
733    };
734
735    (y_outputs, metadatas)
736}
737
738/// # Safety
739/// `m` must contain positions that correspond to the parent table
740#[cfg(feature = "alloc")]
741unsafe fn match_to_result<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
742    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
743    m: &Match,
744) -> (Y, [Position; 2], Metadata<K, TABLE_NUMBER>)
745where
746    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
747    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
748    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
749    [(); 1 << K]:,
750    [(); num_buckets(K)]:,
751    [(); num_buckets(K) - 1]:,
752{
753    // SAFETY: Guaranteed by function contract
754    let left_metadata = unsafe { parent_table.metadata(m.left_position) };
755    // SAFETY: Guaranteed by function contract
756    let right_metadata = unsafe { parent_table.metadata(m.right_position) };
757
758    let (y, metadata) =
759        compute_fn::<K, TABLE_NUMBER, PARENT_TABLE_NUMBER>(m.left_y, left_metadata, right_metadata);
760
761    (y, [m.left_position, m.right_position], metadata)
762}
763
764/// # Safety
765/// `matches` must contain positions that correspond to the parent table
766#[cfg(feature = "alloc")]
767#[inline(always)]
768unsafe fn match_to_result_simd<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
769    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
770    matches: &[Match; COMPUTE_FN_SIMD_FACTOR],
771) -> (
772    [Y; COMPUTE_FN_SIMD_FACTOR],
773    [[Position; 2]; COMPUTE_FN_SIMD_FACTOR],
774    [Metadata<K, TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
775)
776where
777    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
778    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
779    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
780    [(); 1 << K]:,
781    [(); num_buckets(K)]:,
782    [(); num_buckets(K) - 1]:,
783{
784    let left_ys: [_; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
785        [
786        #(
787            matches[N].left_y,
788        )*
789        ]
790    });
791    // SAFETY: Guaranteed by function contract
792    let left_metadatas: [_; COMPUTE_FN_SIMD_FACTOR] = unsafe {
793        seq!(N in 0..16 {
794            [
795            #(
796                parent_table.metadata(matches[N].left_position),
797            )*
798            ]
799        })
800    };
801    // SAFETY: Guaranteed by function contract
802    let right_metadatas: [_; COMPUTE_FN_SIMD_FACTOR] = unsafe {
803        seq!(N in 0..16 {
804            [
805            #(
806                parent_table.metadata(matches[N].right_position),
807            )*
808            ]
809        })
810    };
811
812    let (y_outputs, metadatas) = compute_fn_simd::<K, TABLE_NUMBER, PARENT_TABLE_NUMBER>(
813        left_ys,
814        left_metadatas,
815        right_metadatas,
816    );
817
818    let positions = seq!(N in 0..16 {
819        [
820        #(
821            [
822                matches[N].left_position,
823                matches[N].right_position,
824            ],
825        )*
826        ]
827    });
828
829    (y_outputs, positions, metadatas)
830}
831
832/// # Safety
833/// `matches` must contain positions that correspond to the parent table. `ys`, `position` and
834/// `metadatas` length must be at least the length of `matches`
835#[cfg(feature = "alloc")]
836#[inline(always)]
837unsafe fn matches_to_results<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
838    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
839    matches: &[Match],
840    ys: &mut [MaybeUninit<Y>],
841    positions: &mut [MaybeUninit<[Position; 2]>],
842    metadatas: &mut [MaybeUninit<Metadata<K, TABLE_NUMBER>>],
843) where
844    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
845    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
846    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
847    [(); 1 << K]:,
848    [(); num_buckets(K)]:,
849    [(); num_buckets(K) - 1]:,
850{
851    let (grouped_matches, other_matches) = matches.as_chunks::<COMPUTE_FN_SIMD_FACTOR>();
852    let (grouped_ys, other_ys) = ys.split_at_mut(grouped_matches.as_flattened().len());
853    let grouped_ys = grouped_ys.as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>().0;
854    let (grouped_positions, other_positions) =
855        positions.split_at_mut(grouped_matches.as_flattened().len());
856    let grouped_positions = grouped_positions
857        .as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>()
858        .0;
859    let (grouped_metadatas, other_metadatas) =
860        metadatas.split_at_mut(grouped_matches.as_flattened().len());
861    let grouped_metadatas = grouped_metadatas
862        .as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>()
863        .0;
864
865    for (((grouped_matches, grouped_ys), grouped_positions), grouped_metadatas) in grouped_matches
866        .iter()
867        .zip(grouped_ys)
868        .zip(grouped_positions)
869        .zip(grouped_metadatas)
870    {
871        // SAFETY: Guaranteed by function contract
872        let (ys_group, positions_group, metadatas_group) =
873            unsafe { match_to_result_simd(parent_table, grouped_matches) };
874        grouped_ys.write_copy_of_slice(&ys_group);
875        grouped_positions.write_copy_of_slice(&positions_group);
876
877        // The last table doesn't have metadata
878        if metadata_size_bits(K, TABLE_NUMBER) > 0 {
879            grouped_metadatas.write_copy_of_slice(&metadatas_group);
880        }
881    }
882    for (((other_match, other_y), other_positions), other_metadata) in other_matches
883        .iter()
884        .zip(other_ys)
885        .zip(other_positions)
886        .zip(other_metadatas)
887    {
888        // SAFETY: Guaranteed by function contract
889        let (y, p, metadata) = unsafe { match_to_result(parent_table, other_match) };
890        other_y.write(y);
891        other_positions.write(p);
892        // The last table doesn't have metadata
893        if metadata_size_bits(K, TABLE_NUMBER) > 0 {
894            other_metadata.write(metadata);
895        }
896    }
897}
898
899/// Similar to [`Table`], but smaller size for later processing stages
900#[cfg(feature = "alloc")]
901#[derive(Debug)]
902pub(super) enum PrunedTable<const K: u8, const TABLE_NUMBER: u8>
903where
904    [(); 1 << K]:,
905    [(); num_buckets(K) - 1]:,
906{
907    First,
908    /// Other tables
909    Other {
910        /// Left and right entry positions in a previous table encoded into bits
911        positions: Box<[MaybeUninit<[Position; 2]>]>,
912    },
913    /// Other tables
914    #[cfg(feature = "parallel")]
915    OtherBuckets {
916        /// Left and right entry positions in a previous table encoded into bits.
917        ///
918        /// Only positions from the `buckets` field are guaranteed to be initialized.
919        positions: Box<[[MaybeUninit<[Position; 2]>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1]>,
920    },
921}
922
923#[cfg(feature = "alloc")]
924impl<const K: u8, const TABLE_NUMBER: u8> PrunedTable<K, TABLE_NUMBER>
925where
926    [(); 1 << K]:,
927    [(); num_buckets(K) - 1]:,
928{
929    /// Get `[left_position, right_position]` of a previous table for a specified position in a
930    /// current table.
931    ///
932    /// # Safety
933    /// `position` must come from [`Table::buckets()`] or [`Self::position()`] and not be a sentinel
934    /// value.
935    #[inline(always)]
936    pub(super) unsafe fn position(&self, position: Position) -> [Position; 2] {
937        match self {
938            Self::First => {
939                unreachable!("Not the first table");
940            }
941            Self::Other { positions, .. } => {
942                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
943                unsafe { positions.get_unchecked(usize::from(position)).assume_init() }
944            }
945            #[cfg(feature = "parallel")]
946            Self::OtherBuckets { positions, .. } => {
947                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
948                unsafe {
949                    positions
950                        .as_flattened()
951                        .get_unchecked(usize::from(position))
952                        .assume_init()
953                }
954            }
955        }
956    }
957}
958
959#[cfg(feature = "alloc")]
960#[derive(Debug)]
961pub(super) enum Table<const K: u8, const TABLE_NUMBER: u8>
962where
963    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
964    [(); 1 << K]:,
965    [(); num_buckets(K)]:,
966    [(); num_buckets(K) - 1]:,
967{
968    /// First table
969    First {
970        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
971        ///
972        /// Buckets are padded with sentinel values to `REDUCED_BUCKETS_SIZE`.
973        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
974    },
975    /// Other tables
976    Other {
977        /// Left and right entry positions in a previous table encoded into bits
978        positions: Box<[MaybeUninit<[Position; 2]>]>,
979        /// Metadata corresponding to each entry
980        metadatas: Box<[MaybeUninit<Metadata<K, TABLE_NUMBER>>]>,
981        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
982        ///
983        /// Buckets are padded with sentinel values to `REDUCED_BUCKETS_SIZE`.
984        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
985    },
986    /// Other tables
987    #[cfg(feature = "parallel")]
988    OtherBuckets {
989        /// Left and right entry positions in a previous table encoded into bits.
990        ///
991        /// Only positions from the `buckets` field are guaranteed to be initialized.
992        positions: Box<[[MaybeUninit<[Position; 2]>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1]>,
993        /// Metadata corresponding to each entry.
994        ///
995        /// Only positions from the `buckets` field are guaranteed to be initialized.
996        metadatas: Box<
997            [[MaybeUninit<Metadata<K, TABLE_NUMBER>>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1],
998        >,
999        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
1000        ///
1001        /// Buckets are padded with sentinel values to `REDUCED_BUCKETS_SIZE`.
1002        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
1003    },
1004}
1005
1006#[cfg(feature = "alloc")]
1007impl<const K: u8> Table<K, 1>
1008where
1009    EvaluatableUsize<{ metadata_size_bytes(K, 1) }>: Sized,
1010    [(); 1 << K]:,
1011    [(); num_buckets(K)]:,
1012    [(); num_buckets(K) - 1]:,
1013{
1014    /// Create the table
1015    pub(super) fn create(seed: Seed) -> Self
1016    where
1017        EvaluatableUsize<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>: Sized,
1018    {
1019        // `MAX_BUCKET_SIZE` is not actively used, but is an upper-bound reference for the other
1020        // parameters
1021        debug_assert!(
1022            MAX_BUCKET_SIZE >= bucket_size_upper_bound(K, BUCKET_SIZE_UPPER_BOUND_SECURITY_BITS),
1023            "Max bucket size is not sufficiently large"
1024        );
1025
1026        let partial_ys = partial_ys::<K>(seed);
1027
1028        // SAFETY: Contents is `MaybeUninit`
1029        let mut ys = unsafe { Box::<[MaybeUninit<Y>; 1 << K]>::new_uninit().assume_init() };
1030
1031        for ((ys, xs_batch_start), partial_ys) in ys
1032            .as_chunks_mut::<COMPUTE_F1_SIMD_FACTOR>()
1033            .0
1034            .iter_mut()
1035            .zip((X::ZERO..).step_by(COMPUTE_F1_SIMD_FACTOR))
1036            .zip(
1037                partial_ys
1038                    .as_chunks::<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>()
1039                    .0,
1040            )
1041        {
1042            let xs = Simd::splat(u32::from(xs_batch_start))
1043                + Simd::from_array(array::from_fn(|i| i as u32));
1044            let ys_batch = compute_f1_simd::<K>(xs, partial_ys);
1045
1046            ys.write_copy_of_slice(&ys_batch);
1047        }
1048
1049        // SAFETY: Converting a boxed array to a vector of the same size, which has the same memory
1050        // layout, all elements were initialized
1051        let ys = unsafe {
1052            let ys_len = ys.len();
1053            let ys = Box::into_raw(ys);
1054            Vec::from_raw_parts(ys.cast(), ys_len, ys_len)
1055        };
1056
1057        // TODO: Try to group buckets in the process of collecting `y`s
1058        let mut buckets = group_by_buckets::<K>(&ys);
1059        sort_buckets::<K>(&mut buckets);
1060
1061        Self::First { buckets }
1062    }
1063
1064    /// Create the table, leverages available parallelism
1065    #[cfg(feature = "parallel")]
1066    pub(super) fn create_parallel(seed: Seed) -> Self
1067    where
1068        EvaluatableUsize<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>: Sized,
1069    {
1070        // `MAX_BUCKET_SIZE` is not actively used, but is an upper-bound reference for the other
1071        // parameters
1072        debug_assert!(
1073            MAX_BUCKET_SIZE >= bucket_size_upper_bound(K, BUCKET_SIZE_UPPER_BOUND_SECURITY_BITS),
1074            "Max bucket size is not sufficiently large"
1075        );
1076
1077        let partial_ys = partial_ys::<K>(seed);
1078
1079        // SAFETY: Contents is `MaybeUninit`
1080        let mut ys = unsafe { Box::<[MaybeUninit<Y>; 1 << K]>::new_uninit().assume_init() };
1081
1082        // TODO: Try parallelism here?
1083        for ((ys, xs_batch_start), partial_ys) in ys
1084            .as_chunks_mut::<COMPUTE_F1_SIMD_FACTOR>()
1085            .0
1086            .iter_mut()
1087            .zip((X::ZERO..).step_by(COMPUTE_F1_SIMD_FACTOR))
1088            .zip(
1089                partial_ys
1090                    .as_chunks::<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>()
1091                    .0,
1092            )
1093        {
1094            let xs = Simd::splat(u32::from(xs_batch_start))
1095                + Simd::from_array(array::from_fn(|i| i as u32));
1096            let ys_batch = compute_f1_simd::<K>(xs, partial_ys);
1097
1098            ys.write_copy_of_slice(&ys_batch);
1099        }
1100
1101        // SAFETY: Converting a boxed array to a vector of the same size, which has the same memory
1102        // layout, all elements were initialized
1103        let ys = unsafe {
1104            let ys_len = ys.len();
1105            let ys = Box::into_raw(ys);
1106            Vec::from_raw_parts(ys.cast(), ys_len, ys_len)
1107        };
1108
1109        // TODO: Try to group buckets in the process of collecting `y`s
1110        let mut buckets = group_by_buckets::<K>(&ys);
1111        sort_buckets::<K>(&mut buckets);
1112
1113        Self::First { buckets }
1114    }
1115}
1116
1117#[cfg(feature = "alloc")]
1118mod private {
1119    pub(in super::super) trait SupportedOtherTables {}
1120    pub(in super::super) trait NotLastTable {}
1121}
1122
1123#[cfg(feature = "alloc")]
1124impl<const K: u8> private::SupportedOtherTables for Table<K, 2>
1125where
1126    EvaluatableUsize<{ metadata_size_bytes(K, 2) }>: Sized,
1127    [(); 1 << K]:,
1128    [(); num_buckets(K)]:,
1129    [(); num_buckets(K) - 1]:,
1130{
1131}
1132#[cfg(feature = "alloc")]
1133impl<const K: u8> private::SupportedOtherTables for Table<K, 3>
1134where
1135    EvaluatableUsize<{ metadata_size_bytes(K, 3) }>: Sized,
1136    [(); 1 << K]:,
1137    [(); num_buckets(K)]:,
1138    [(); num_buckets(K) - 1]:,
1139{
1140}
1141#[cfg(feature = "alloc")]
1142impl<const K: u8> private::SupportedOtherTables for Table<K, 4>
1143where
1144    EvaluatableUsize<{ metadata_size_bytes(K, 4) }>: Sized,
1145    [(); 1 << K]:,
1146    [(); num_buckets(K)]:,
1147    [(); num_buckets(K) - 1]:,
1148{
1149}
1150#[cfg(feature = "alloc")]
1151impl<const K: u8> private::SupportedOtherTables for Table<K, 5>
1152where
1153    EvaluatableUsize<{ metadata_size_bytes(K, 5) }>: Sized,
1154    [(); 1 << K]:,
1155    [(); num_buckets(K)]:,
1156    [(); num_buckets(K) - 1]:,
1157{
1158}
1159#[cfg(feature = "alloc")]
1160impl<const K: u8> private::SupportedOtherTables for Table<K, 6>
1161where
1162    EvaluatableUsize<{ metadata_size_bytes(K, 6) }>: Sized,
1163    [(); 1 << K]:,
1164    [(); num_buckets(K)]:,
1165    [(); num_buckets(K) - 1]:,
1166{
1167}
1168#[cfg(feature = "alloc")]
1169impl<const K: u8> private::SupportedOtherTables for Table<K, 7>
1170where
1171    EvaluatableUsize<{ metadata_size_bytes(K, 7) }>: Sized,
1172    [(); 1 << K]:,
1173    [(); num_buckets(K)]:,
1174    [(); num_buckets(K) - 1]:,
1175{
1176}
1177
1178#[cfg(feature = "alloc")]
1179impl<const K: u8> private::NotLastTable for Table<K, 1>
1180where
1181    EvaluatableUsize<{ metadata_size_bytes(K, 1) }>: Sized,
1182    [(); 1 << K]:,
1183    [(); num_buckets(K)]:,
1184    [(); num_buckets(K) - 1]:,
1185{
1186}
1187#[cfg(feature = "alloc")]
1188impl<const K: u8> private::NotLastTable for Table<K, 2>
1189where
1190    EvaluatableUsize<{ metadata_size_bytes(K, 2) }>: Sized,
1191    [(); 1 << K]:,
1192    [(); num_buckets(K)]:,
1193    [(); num_buckets(K) - 1]:,
1194{
1195}
1196#[cfg(feature = "alloc")]
1197impl<const K: u8> private::NotLastTable for Table<K, 3>
1198where
1199    EvaluatableUsize<{ metadata_size_bytes(K, 3) }>: Sized,
1200    [(); 1 << K]:,
1201    [(); num_buckets(K)]:,
1202    [(); num_buckets(K) - 1]:,
1203{
1204}
1205#[cfg(feature = "alloc")]
1206impl<const K: u8> private::NotLastTable for Table<K, 4>
1207where
1208    EvaluatableUsize<{ metadata_size_bytes(K, 4) }>: Sized,
1209    [(); 1 << K]:,
1210    [(); num_buckets(K)]:,
1211    [(); num_buckets(K) - 1]:,
1212{
1213}
1214#[cfg(feature = "alloc")]
1215impl<const K: u8> private::NotLastTable for Table<K, 5>
1216where
1217    EvaluatableUsize<{ metadata_size_bytes(K, 5) }>: Sized,
1218    [(); 1 << K]:,
1219    [(); num_buckets(K)]:,
1220    [(); num_buckets(K) - 1]:,
1221{
1222}
1223#[cfg(feature = "alloc")]
1224impl<const K: u8> private::NotLastTable for Table<K, 6>
1225where
1226    EvaluatableUsize<{ metadata_size_bytes(K, 6) }>: Sized,
1227    [(); 1 << K]:,
1228    [(); num_buckets(K)]:,
1229    [(); num_buckets(K) - 1]:,
1230{
1231}
1232
1233#[cfg(feature = "alloc")]
1234impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
1235where
1236    Self: private::SupportedOtherTables,
1237    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
1238    [(); 1 << K]:,
1239    [(); num_buckets(K)]:,
1240    [(); num_buckets(K) - 1]:,
1241{
1242    /// Creates a new [`TABLE_NUMBER`] table. There also exists [`Self::create_parallel()`] that
1243    /// trades CPU efficiency and memory usage for lower latency and with multiple parallel calls,
1244    /// better overall performance.
1245    pub(super) fn create<const PARENT_TABLE_NUMBER: u8>(
1246        parent_table: Table<K, PARENT_TABLE_NUMBER>,
1247        cache: &TablesCache,
1248    ) -> (Self, PrunedTable<K, PARENT_TABLE_NUMBER>)
1249    where
1250        Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
1251        EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
1252    {
1253        let left_targets = &*cache.left_targets;
1254        let mut initialized_elements = 0_usize;
1255        let mut ys: Box<[MaybeUninit<Y>]> = Box::new_uninit_slice(max_table_entries(K));
1256        let mut positions: Box<[MaybeUninit<[Position; 2]>]> =
1257            Box::new_uninit_slice(max_table_entries(K));
1258        // The last table doesn't have metadata
1259        let mut metadatas: Box<[MaybeUninit<Metadata<K, TABLE_NUMBER>>]> =
1260            Box::new_uninit_slice(max_table_entries(K));
1261
1262        for ([left_bucket, right_bucket], left_bucket_index) in
1263            parent_table.buckets().array_windows().zip(0..)
1264        {
1265            let mut matches = [MaybeUninit::uninit(); _];
1266            // SAFETY: Positions are taken from `Table::buckets()` and correspond to initialized
1267            // values
1268            let matches = unsafe {
1269                find_matches_in_buckets(
1270                    left_bucket_index,
1271                    left_bucket,
1272                    right_bucket,
1273                    &mut matches,
1274                    left_targets,
1275                )
1276            };
1277            // Throw away some successful matches that are not that necessary
1278            let matches = &matches[..matches.len().min(REDUCED_MATCHES_COUNT)];
1279            // SAFETY: Already initialized this many elements
1280            let (ys, positions, metadatas) = unsafe {
1281                (
1282                    ys.split_at_mut_unchecked(initialized_elements).1,
1283                    positions.split_at_mut_unchecked(initialized_elements).1,
1284                    metadatas.split_at_mut_unchecked(initialized_elements).1,
1285                )
1286            };
1287
1288            // SAFETY: arrays are sized to `max_table_entries(K)`, the exact upper bound on the total
1289            // number of matches, so `initialized_elements + matches.len()` never exceeds the length
1290            let (ys, positions, metadatas) = unsafe {
1291                (
1292                    ys.split_at_mut_unchecked(matches.len()).0,
1293                    positions.split_at_mut_unchecked(matches.len()).0,
1294                    metadatas.split_at_mut_unchecked(matches.len()).0,
1295                )
1296            };
1297
1298            // SAFETY: Matches come from the parent table and the size of `ys`, `positions`
1299            // and `metadatas` is the same as the number of matches
1300            unsafe {
1301                matches_to_results(&parent_table, matches, ys, positions, metadatas);
1302            }
1303
1304            initialized_elements += matches.len();
1305        }
1306
1307        let parent_table = parent_table.prune();
1308
1309        // SAFETY: Converting a boxed array to a vector of the same size, which has the same memory
1310        // layout, the number of elements matches the number of elements that were initialized
1311        let ys = unsafe {
1312            let ys_len = ys.len();
1313            let ys = Box::into_raw(ys);
1314            Vec::from_raw_parts(ys.cast(), initialized_elements, ys_len)
1315        };
1316
1317        // TODO: Try to group buckets in the process of collecting `y`s
1318        let mut buckets = group_by_buckets::<K>(&ys);
1319        sort_buckets::<K>(&mut buckets);
1320
1321        let table = Self::Other {
1322            positions,
1323            metadatas,
1324            buckets,
1325        };
1326
1327        (table, parent_table)
1328    }
1329
1330    /// Almost the same as [`Self::create()`], but uses parallelism internally for better
1331    /// performance (though not efficiency of CPU and memory usage), if you create multiple tables
1332    /// in parallel, prefer [`Self::create_parallel()`] for better overall performance.
1333    #[cfg(feature = "parallel")]
1334    pub(super) fn create_parallel<const PARENT_TABLE_NUMBER: u8>(
1335        parent_table: Table<K, PARENT_TABLE_NUMBER>,
1336        cache: &TablesCache,
1337    ) -> (Self, PrunedTable<K, PARENT_TABLE_NUMBER>)
1338    where
1339        Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
1340        EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
1341    {
1342        // SAFETY: Contents is `MaybeUninit`
1343        let ys = unsafe {
1344            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
1345        };
1346        // SAFETY: Contents is `MaybeUninit`
1347        let positions = unsafe {
1348            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
1349        };
1350        // SAFETY: Contents is `MaybeUninit`
1351        let metadatas = unsafe {
1352            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
1353        };
1354        let global_results_counts =
1355            array::from_fn::<_, { num_buckets(K) - 1 }, _>(|_| SyncUnsafeCell::new(0u16));
1356
1357        let left_targets = &*cache.left_targets;
1358
1359        let buckets = parent_table.buckets();
1360        // Iterate over buckets in batches, such that a cache line worth of bytes is taken from
1361        // `global_results_counts` each time to avoid unnecessary false sharing
1362        let bucket_batch_size = CACHE_LINE_SIZE / size_of::<u16>();
1363        let bucket_batch_index = AtomicUsize::new(0);
1364
1365        rayon::broadcast(|_ctx| {
1366            loop {
1367                let bucket_batch_index = bucket_batch_index.fetch_add(1, Ordering::Relaxed);
1368
1369                let buckets_batch = buckets
1370                    .array_windows::<2>()
1371                    .enumerate()
1372                    .skip(bucket_batch_index * bucket_batch_size)
1373                    .take(bucket_batch_size);
1374
1375                if buckets_batch.is_empty() {
1376                    break;
1377                }
1378
1379                for (left_bucket_index, [left_bucket, right_bucket]) in buckets_batch {
1380                    let mut matches = [MaybeUninit::uninit(); _];
1381                    // SAFETY: Positions are taken from `Table::buckets()` and correspond to
1382                    // initialized values
1383                    let matches = unsafe {
1384                        find_matches_in_buckets(
1385                            left_bucket_index as u32,
1386                            left_bucket,
1387                            right_bucket,
1388                            &mut matches,
1389                            left_targets,
1390                        )
1391                    };
1392                    // Throw away some successful matches that are not that necessary
1393                    let matches = &matches[..matches.len().min(REDUCED_MATCHES_COUNT)];
1394
1395                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
1396                    // at this time, and it is guaranteed to be in range
1397                    let ys = unsafe { &mut *ys.get_unchecked(left_bucket_index).get() };
1398                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
1399                    // at this time, and it is guaranteed to be in range
1400                    let positions =
1401                        unsafe { &mut *positions.get_unchecked(left_bucket_index).get() };
1402                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
1403                    // at this time, and it is guaranteed to be in range
1404                    let metadatas =
1405                        unsafe { &mut *metadatas.get_unchecked(left_bucket_index).get() };
1406                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
1407                    // at this time, and it is guaranteed to be in range
1408                    let count = unsafe {
1409                        &mut *global_results_counts.get_unchecked(left_bucket_index).get()
1410                    };
1411
1412                    // SAFETY: Matches come from the parent table and the size of `ys`, `positions`
1413                    // and `metadatas` is larger or equal to the number of matches
1414                    unsafe {
1415                        matches_to_results::<_, TABLE_NUMBER, _>(
1416                            &parent_table,
1417                            matches,
1418                            ys,
1419                            positions,
1420                            metadatas,
1421                        )
1422                    };
1423                    *count = matches.len() as u16;
1424                }
1425            }
1426        });
1427
1428        let parent_table = parent_table.prune();
1429
1430        let ys = strip_sync_unsafe_cell(ys);
1431        let positions = strip_sync_unsafe_cell(positions);
1432        let metadatas = strip_sync_unsafe_cell(metadatas);
1433
1434        // TODO: Try to group buckets in the process of collecting `y`s
1435        // SAFETY: `global_results_counts` corresponds to the number of initialized `ys`
1436        let mut buckets = unsafe {
1437            group_by_buckets_from_buckets::<K, _>(
1438                ys.iter().zip(
1439                    global_results_counts
1440                        .into_iter()
1441                        .map(|count| usize::from(count.into_inner())),
1442                ),
1443            )
1444        };
1445        sort_buckets::<K>(&mut buckets);
1446
1447        let table = Self::OtherBuckets {
1448            positions,
1449            metadatas,
1450            buckets,
1451        };
1452
1453        (table, parent_table)
1454    }
1455
1456    /// Get `[left_position, right_position]` of a previous table for a specified position in a
1457    /// current table.
1458    ///
1459    /// # Safety
1460    /// `position` must come from [`Self::buckets()`] or [`Self::position()`] or
1461    /// [`PrunedTable::position()`] and not be a sentinel value.
1462    #[inline(always)]
1463    pub(super) unsafe fn position(&self, position: Position) -> [Position; 2] {
1464        match self {
1465            Self::First { .. } => {
1466                unreachable!("Not the first table");
1467            }
1468            Self::Other { positions, .. } => {
1469                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
1470                unsafe { positions.get_unchecked(usize::from(position)).assume_init() }
1471            }
1472            #[cfg(feature = "parallel")]
1473            Self::OtherBuckets { positions, .. } => {
1474                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
1475                unsafe {
1476                    positions
1477                        .as_flattened()
1478                        .get_unchecked(usize::from(position))
1479                        .assume_init()
1480                }
1481            }
1482        }
1483    }
1484}
1485
1486#[cfg(feature = "alloc")]
1487impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
1488where
1489    Self: private::NotLastTable,
1490    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
1491    [(); 1 << K]:,
1492    [(); num_buckets(K)]:,
1493    [(); num_buckets(K) - 1]:,
1494{
1495    /// Returns `None` for an invalid position or for table number 7.
1496    ///
1497    /// # Safety
1498    /// `position` must come from [`Self::buckets()`] and not be a sentinel value.
1499    #[inline(always)]
1500    unsafe fn metadata(&self, position: Position) -> Metadata<K, TABLE_NUMBER> {
1501        match self {
1502            Self::First { .. } => {
1503                // X matches position
1504                Metadata::from(X::from(u32::from(position)))
1505            }
1506            Self::Other { metadatas, .. } => {
1507                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
1508                unsafe { metadatas.get_unchecked(usize::from(position)).assume_init() }
1509            }
1510            #[cfg(feature = "parallel")]
1511            Self::OtherBuckets { metadatas, .. } => {
1512                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
1513                unsafe {
1514                    metadatas
1515                        .as_flattened()
1516                        .get_unchecked(usize::from(position))
1517                        .assume_init()
1518                }
1519            }
1520        }
1521    }
1522}
1523
1524#[cfg(feature = "alloc")]
1525impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
1526where
1527    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
1528    [(); 1 << K]:,
1529    [(); num_buckets(K)]:,
1530    [(); num_buckets(K) - 1]:,
1531{
1532    #[inline(always)]
1533    fn prune(self) -> PrunedTable<K, TABLE_NUMBER> {
1534        match self {
1535            Self::First { .. } => PrunedTable::First,
1536            Self::Other { positions, .. } => PrunedTable::Other { positions },
1537            #[cfg(feature = "parallel")]
1538            Self::OtherBuckets { positions, .. } => PrunedTable::OtherBuckets { positions },
1539        }
1540    }
1541
1542    /// Positions of `y`s grouped by the bucket they belong to
1543    #[inline(always)]
1544    pub(super) fn buckets(&self) -> &[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)] {
1545        match self {
1546            Self::First { buckets, .. } => buckets,
1547            Self::Other { buckets, .. } => buckets,
1548            #[cfg(feature = "parallel")]
1549            Self::OtherBuckets { buckets, .. } => buckets,
1550        }
1551    }
1552}