subspace_proof_of_space/chiapos/
table.rs

1//! `chiapos` table implementation.
2
3#[cfg(any(feature = "alloc", test))]
4mod rmap;
5#[cfg(test)]
6mod tests;
7pub(super) mod types;
8
9use crate::chiapos::Seed;
10use crate::chiapos::constants::{PARAM_B, PARAM_BC, PARAM_C, PARAM_EXT, PARAM_M};
11#[cfg(feature = "alloc")]
12use crate::chiapos::table::rmap::Rmap;
13use crate::chiapos::table::types::{Metadata, X, Y};
14#[cfg(feature = "alloc")]
15use crate::chiapos::table::types::{Position, R};
16use crate::chiapos::utils::EvaluatableUsize;
17use ab_chacha8::{ChaCha8Block, ChaCha8State};
18#[cfg(feature = "alloc")]
19use alloc::boxed::Box;
20#[cfg(feature = "alloc")]
21use alloc::vec;
22#[cfg(feature = "alloc")]
23use alloc::vec::Vec;
24#[cfg(feature = "alloc")]
25use chacha20::cipher::{Iv, KeyIvInit, StreamCipher};
26#[cfg(feature = "alloc")]
27use chacha20::{ChaCha8, Key};
28use core::array;
29#[cfg(feature = "parallel")]
30use core::cell::SyncUnsafeCell;
31#[cfg(feature = "alloc")]
32use core::mem;
33#[cfg(feature = "alloc")]
34use core::mem::MaybeUninit;
35#[cfg(any(feature = "alloc", test))]
36use core::simd::prelude::*;
37#[cfg(feature = "parallel")]
38use core::sync::atomic::{AtomicUsize, Ordering};
39#[cfg(feature = "alloc")]
40use derive_more::Deref;
41#[cfg(any(feature = "alloc", test))]
42use seq_macro::seq;
43// TODO: Switch to `rclite` once https://github.com/fereidani/rclite/issues/11 is resolved
44#[cfg(feature = "alloc")]
45use alloc::sync::Arc;
46use subspace_core_primitives::hashes::blake3_hash;
47
48pub(super) const COMPUTE_F1_SIMD_FACTOR: usize = 8;
49#[cfg(any(feature = "alloc", test))]
50const COMPUTE_FN_SIMD_FACTOR: usize = 16;
51#[allow(dead_code, reason = "unused when crate is compiled separately")]
52const MAX_BUCKET_SIZE: usize = 512;
53#[cfg(feature = "alloc")]
54const BUCKET_SIZE_UPPER_BOUND_SECURITY_BITS: u8 = 128;
55/// Bucket size used for fixed-size bucketed arrays.
56///
57/// Must be `MAX_BUCKET_SIZE` to avoid silently dropping entries when a bucket's Y values exceed
58/// the limit. With `PARAM_BC / 2^PARAM_EXT ≈ 236` expected entries per bucket and std dev ≈ 15.4,
59/// the original value of 272 (~2.3 sigma) caused overflow in production (K=20), dropping hundreds
60/// of proofs and causing "Missing PoS proof" farming errors.
61#[allow(dead_code, reason = "unused when crate is compiled separately")]
62pub(crate) const REDUCED_BUCKET_SIZE: usize = MAX_BUCKET_SIZE;
63/// Matches count limit per bucket pair.
64///
65/// Must be `MAX_BUCKET_SIZE` to avoid silently dropping matches. The original value of 288 could
66/// cause early termination in `find_matches_in_buckets`, losing valid proof paths.
67#[allow(dead_code, reason = "unused when crate is compiled separately")]
68const REDUCED_MATCHES_COUNT: usize = MAX_BUCKET_SIZE;
69#[cfg(feature = "parallel")]
70const CACHE_LINE_SIZE: usize = 64;
71
72const _: () = {
73    debug_assert!(REDUCED_BUCKET_SIZE <= MAX_BUCKET_SIZE);
74    debug_assert!(REDUCED_MATCHES_COUNT <= MAX_BUCKET_SIZE);
75};
76
77/// Compute the size of `y` in bits
78pub(super) const fn y_size_bits(k: u8) -> usize {
79    k as usize + PARAM_EXT as usize
80}
81
82/// Metadata size in bytes
83pub const fn metadata_size_bytes(k: u8, table_number: u8) -> usize {
84    metadata_size_bits(k, table_number).div_ceil(u8::BITS as usize)
85}
86
87/// Metadata size in bits
88pub(super) const fn metadata_size_bits(k: u8, table_number: u8) -> usize {
89    k as usize
90        * match table_number {
91            1 => 1,
92            2 => 2,
93            3 | 4 => 4,
94            5 => 3,
95            6 => 2,
96            7 => 0,
97            _ => unreachable!(),
98        }
99}
100
101/// Number of buckets for a given `k`
102pub const fn num_buckets(k: u8) -> usize {
103    2_usize
104        .pow(y_size_bits(k) as u32)
105        .div_ceil(PARAM_BC as usize)
106}
107
/// Converts a boxed array of [`SyncUnsafeCell<T>`] into a boxed array of `T` without copying,
/// once exclusive ownership of the box makes the interior-mutability wrapper unnecessary.
#[cfg(feature = "parallel")]
#[inline(always)]
fn strip_sync_unsafe_cell<const N: usize, T>(value: Box<[SyncUnsafeCell<T>; N]>) -> Box<[T; N]> {
    // SAFETY: `SyncUnsafeCell` has the same layout as `T`
    unsafe { Box::from_raw(Box::into_raw(value).cast()) }
}
114
/// ChaCha8 [`Vec`] sufficient for the whole first table for [`K`].
/// Prefer [`partial_y`] if you need partial y just for a single `x`.
#[cfg(feature = "alloc")]
fn partial_ys<const K: u8>(seed: Seed) -> Vec<u8> {
    // `K` bits for each of the `2^K` values of `x`, rounded up to whole bytes
    let total_bits = usize::from(K) * (1 << K);
    let mut keystream = vec![0; total_bits.div_ceil(u8::BITS as usize)];

    // The seed is the key; a zero IV yields the canonical table-1 keystream
    let key = Key::from(seed);
    let iv = Iv::<ChaCha8>::default();
    ChaCha8::new(&key, &iv).apply_keystream(&mut keystream);

    keystream
}
131
/// Calculate a probabilistic upper bound on the Chia bucket size for a given `k` and
/// `security_bits` (security level).
///
/// This is based on a Chernoff bound for the Poisson distribution with mean
/// `lambda = PARAM_BC / 2^PARAM_EXT`, ensuring the probability that any bucket exceeds the bound is
/// less than `2^{-security_bits}`.
/// The bound is lambda + ceil(sqrt(3 * lambda * (k + security_bits) * ln(2))).
#[cfg(feature = "alloc")]
const fn bucket_size_upper_bound(k: u8, security_bits: u8) -> usize {
    // Lambda is the expected number of entries in a bucket, approximated as
    // `PARAM_BC / 2^PARAM_EXT`. It is independent of `k`.
    const LAMBDA: u64 = PARAM_BC as u64 / 2u64.pow(PARAM_EXT as u32);
    // Approximation of ln(2) as a fraction: ln(2) ≈ LN2_NUM / LN2_DEN.
    // This allows integer-only computation of the square root term involving ln(2).
    const LN2_NUM: u128 = 693147;
    const LN2_DEN: u128 = 1000000;

    // `k + security_bits` for the union bound over ~2^k intervals
    let ks = k as u128 + security_bits as u128;
    // Numerator of the value under the square root:
    // `3 * lambda * (k + security_bits) * ln(2)` expressed as `num / LN2_DEN`
    let num = 3u128 * LAMBDA as u128 * ks * LN2_NUM;

    // Round the fraction up to an integer; since the square root term is an integer, for integers
    // `x`, `x^2 >= num / LN2_DEN` is equivalent to `x^2 >= ceil(num / LN2_DEN)`
    let ratio = num.div_ceil(LN2_DEN);

    // `ceil(sqrt(ratio))` without floating-point: `isqrt()` returns the floor of the square root,
    // so bump it by one unless the root was exact
    let floor_sqrt = ratio.isqrt();
    let add_term = if floor_sqrt * floor_sqrt == ratio {
        floor_sqrt
    } else {
        floor_sqrt + 1
    } as u64;

    (LAMBDA + add_term) as usize
}
178
/// Groups `(position, y)` pairs into fixed-size buckets, where the bucket index is
/// `y / PARAM_BC` and `position` is the index of `y` within `ys`.
///
/// Every bucket is padded at the end with `(Position::SENTINEL, Y::SENTINEL)` entries. If more
/// than `REDUCED_BUCKET_SIZE` entries map to the same bucket, the excess entries are silently
/// skipped (see the docs on `REDUCED_BUCKET_SIZE` for why this is not expected in practice).
#[cfg(feature = "alloc")]
fn group_by_buckets<const K: u8>(
    ys: &[Y],
) -> Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>
where
    [(); num_buckets(K)]:,
{
    // Number of initialized entries in each bucket so far
    let mut bucket_offsets = [0_u16; num_buckets(K)];
    // SAFETY: Contents is `MaybeUninit`
    let mut buckets = unsafe {
        Box::<[[MaybeUninit<(Position, Y)>; REDUCED_BUCKET_SIZE]; num_buckets(K)]>::new_uninit()
            .assume_init()
    };

    for (&y, position) in ys.iter().zip(Position::ZERO..) {
        let bucket_index = (u32::from(y) / u32::from(PARAM_BC)) as usize;

        // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
        let bucket_offset = unsafe { bucket_offsets.get_unchecked_mut(bucket_index) };
        // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
        let bucket = unsafe { buckets.get_unchecked_mut(bucket_index) };

        // Defensive bound: drop the entry rather than write out of bounds when a bucket is full
        if *bucket_offset < REDUCED_BUCKET_SIZE as u16 {
            bucket[*bucket_offset as usize].write((position, y));
            *bucket_offset += 1;
        }
    }

    // Pad the uninitialized tail of every bucket with sentinel values
    for (bucket, initialized) in buckets.iter_mut().zip(bucket_offsets) {
        bucket[usize::from(initialized)..].write_filled((Position::SENTINEL, Y::SENTINEL));
    }

    // SAFETY: All entries are initialized
    unsafe { Box::from_raw(Box::into_raw(buckets).cast()) }
}
214
/// Similar to [`group_by_buckets()`], but processes buckets instead of a flat list of `y`s.
///
/// Each iterator item is a fixed-size batch; positions are derived as the batch's starting
/// position (`batch_index * REDUCED_MATCHES_COUNT`) plus the index within the batch. Output
/// buckets are sentinel-padded exactly like in [`group_by_buckets()`], and entries overflowing a
/// bucket are silently skipped.
///
/// # Safety
/// Iterator item is a list of potentially uninitialized `y`s and a number of initialized `y`s. The
/// number of initialized `y`s must be correct.
#[cfg(feature = "parallel")]
unsafe fn group_by_buckets_from_buckets<'a, const K: u8, I>(
    iter: I,
) -> Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>
where
    I: Iterator<Item = (&'a [MaybeUninit<Y>; REDUCED_MATCHES_COUNT], usize)> + 'a,
    [(); num_buckets(K)]:,
{
    // Number of initialized entries in each bucket so far
    let mut bucket_offsets = [0_u16; num_buckets(K)];
    // SAFETY: Contents is `MaybeUninit`
    let mut buckets = unsafe {
        Box::<[[MaybeUninit<(Position, Y)>; REDUCED_BUCKET_SIZE]; num_buckets(K)]>::new_uninit()
            .assume_init()
    };

    for ((ys, count), batch_start) in iter.zip((Position::ZERO..).step_by(REDUCED_MATCHES_COUNT)) {
        // SAFETY: Function contract guarantees that `y`s are initialized
        let ys = unsafe { ys[..count].assume_init_ref() };
        for (&y, position) in ys.iter().zip(batch_start..) {
            let bucket_index = (u32::from(y) / u32::from(PARAM_BC)) as usize;

            // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
            let bucket_offset = unsafe { bucket_offsets.get_unchecked_mut(bucket_index) };
            // SAFETY: Bucket is obtained using division by `PARAM_BC` and fits by definition
            let bucket = unsafe { buckets.get_unchecked_mut(bucket_index) };

            // Defensive bound: drop the entry rather than write out of bounds when a bucket is full
            if *bucket_offset < REDUCED_BUCKET_SIZE as u16 {
                bucket[*bucket_offset as usize].write((position, y));
                *bucket_offset += 1;
            }
        }
    }

    // Pad the uninitialized tail of every bucket with sentinel values
    for (bucket, initialized) in buckets.iter_mut().zip(bucket_offsets) {
        bucket[usize::from(initialized)..].write_filled((Position::SENTINEL, Y::SENTINEL));
    }

    // SAFETY: All entries are initialized
    unsafe { Box::from_raw(Box::into_raw(buckets).cast()) }
}
260
/// Sort entries within each bucket by Y value to ensure deterministic proof ordering.
/// This matches the old code's behavior where entries were globally Y-sorted before bucketing.
#[cfg(feature = "alloc")]
fn sort_buckets<const K: u8>(buckets: &mut [[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)])
where
    [(); num_buckets(K)]:,
{
    for bucket in buckets.iter_mut() {
        // Sentinels are padded at the end, so the first sentinel (if any) marks the end of the
        // initialized prefix
        let initialized = bucket
            .iter()
            .position(|&(position, _)| position == Position::SENTINEL)
            .unwrap_or(bucket.len());
        // Position is the tie-breaker for equal `y` values, making the order fully deterministic
        bucket[..initialized].sort_unstable_by_key(|&(position, y)| (u32::from(y), u32::from(position)));
    }
}
276
/// Wrapper that aligns the contained value to 64 bytes (a typical cache line size)
#[cfg(feature = "alloc")]
#[derive(Debug, Copy, Clone, Deref)]
#[repr(align(64))]
struct CacheLineAligned<T>(T);
281
282/// Mapping from `parity` to `r` to `m`
283#[cfg(feature = "alloc")]
284type LeftTargets = [[CacheLineAligned<[R; PARAM_M as usize]>; PARAM_BC as usize]; 2];
285
/// Precomputes, for both parities and every `r` in `0..PARAM_BC`, the sorted array of `PARAM_M`
/// left targets.
///
/// This is the table-driven equivalent of [`calculate_left_target_on_demand()`] and is shared
/// between table generations via [`Arc`] (see [`TablesCache`]).
#[cfg(feature = "alloc")]
fn calculate_left_targets() -> Arc<LeftTargets> {
    let mut left_targets = Arc::<LeftTargets>::new_uninit();
    // SAFETY: Same layout and uninitialized in both cases
    let left_targets_slice = unsafe {
        mem::transmute::<
            &mut MaybeUninit<[[CacheLineAligned<[R; PARAM_M as usize]>; PARAM_BC as usize]; 2]>,
            &mut [[MaybeUninit<CacheLineAligned<[R; PARAM_M as usize]>>; PARAM_BC as usize]; 2],
        >(Arc::get_mut_unchecked(&mut left_targets))
    };

    for parity in 0..=1 {
        for r in 0..PARAM_BC {
            let c = r / PARAM_C;

            // Same formula as in `calculate_left_target_on_demand()`
            let mut arr = array::from_fn(|m| {
                let m = m as u16;
                R::from(
                    ((c + m) % PARAM_B) * PARAM_C
                        + (((2 * m + parity) * (2 * m + parity) + r) % PARAM_C),
                )
            });
            // Store targets in ascending order
            arr.sort_unstable();
            left_targets_slice[parity as usize][r as usize].write(CacheLineAligned(arr));
        }
    }

    // SAFETY: Initialized all entries
    unsafe { left_targets.assume_init() }
}
316
317fn calculate_left_target_on_demand(parity: u32, r: u32, m: u32) -> u32 {
318    let param_b = u32::from(PARAM_B);
319    let param_c = u32::from(PARAM_C);
320
321    ((r / param_c + m) % param_b) * param_c + (((2 * m + parity) * (2 * m + parity) + r) % param_c)
322}
323
324/// Caches that can be used to optimize the creation of multiple [`Tables`](super::Tables).
325#[cfg(feature = "alloc")]
326#[derive(Debug, Clone)]
327pub struct TablesCache {
328    left_targets: Arc<LeftTargets>,
329}
330
#[cfg(feature = "alloc")]
impl Default for TablesCache {
    /// Create a new instance
    fn default() -> Self {
        // Computing the left targets table is the expensive part this cache exists to amortize
        let left_targets = calculate_left_targets();

        Self { left_targets }
    }
}
340
/// A single match between an entry of the left bucket and an entry of the right bucket
#[cfg(feature = "alloc")]
#[derive(Debug, Copy, Clone)]
struct Match {
    /// Position of the left entry in the parent table
    left_position: Position,
    /// `y` value of the left entry
    left_y: Y,
    /// Position of the right entry in the parent table
    right_position: Position,
}
348
/// Computes the table-1 function `f1(x)`: extracts `K` bits of the ChaCha8 keystream (keyed by
/// `seed`) at bit offset `K * x` and appends the `PARAM_EXT` most significant bits of `x` to
/// produce `y`.
pub(super) fn compute_f1<const K: u8>(x: X, seed: &Seed) -> Y {
    // Bit offset of this `x`'s `K` keystream bits, split into whole `u32` words and a remainder
    let skip_bits = u32::from(K) * u32::from(x);
    let skip_u32s = skip_bits / u32::BITS;
    let partial_y_offset = skip_bits % u32::BITS;

    const U32S_PER_BLOCK: usize = size_of::<ChaCha8Block>() / size_of::<u32>();

    let initial_state = ChaCha8State::init(seed, &[0; _]);
    let first_block_counter = skip_u32s / U32S_PER_BLOCK as u32;
    let u32_in_first_block = skip_u32s as usize % U32S_PER_BLOCK;

    let first_block = initial_state.compute_block(first_block_counter);
    let hi = first_block[u32_in_first_block].to_be();

    // The `K` bits may straddle a `u32` boundary, so read 64 bits starting at the word that
    // contains the first bit
    // TODO: Is SIMD version of `compute_block()` that produces two blocks at once possible?
    let lo = if u32_in_first_block + 1 == U32S_PER_BLOCK {
        // Spilled over into the second block
        let second_block = initial_state.compute_block(first_block_counter + 1);
        second_block[0].to_be()
    } else {
        first_block[u32_in_first_block + 1].to_be()
    };

    let partial_y = (u64::from(hi) << u32::BITS) | u64::from(lo);

    // Shift the `K` interesting bits down to the final offset of eventual `y`, leaving room for
    // `PARAM_EXT` bits below them
    let pre_y = partial_y >> (u64::BITS - u32::from(K + PARAM_EXT) - partial_y_offset);
    let pre_y = pre_y as u32;
    // Mask for clearing the rest of bits of `pre_y`.
    let pre_y_mask = (u32::MAX << PARAM_EXT) & (u32::MAX >> (u32::BITS - u32::from(K + PARAM_EXT)));

    // Extract `PARAM_EXT` most significant bits from `x` and store in the final offset of
    // eventual `y` with the rest of bits being zero (`x` is `0..2^K`)
    let pre_ext = u32::from(x) >> (K - PARAM_EXT);

    // Combine all of the bits together:
    // [padding zero bits][`K` bits from `partial_y`][`PARAM_EXT` bits from `x`]
    Y::from((pre_y & pre_y_mask) | pre_ext)
}
388
/// Batched variant of [`compute_f1`] that computes `COMPUTE_F1_SIMD_FACTOR` consecutive table-1
/// `y` values at once from `xs` and the matching pre-computed ChaCha8 keystream bytes (see
/// `partial_ys`).
#[cfg(any(feature = "alloc", test))]
pub(super) fn compute_f1_simd<const K: u8>(
    xs: Simd<u32, COMPUTE_F1_SIMD_FACTOR>,
    partial_ys: &[u8; K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize],
) -> [Y; COMPUTE_F1_SIMD_FACTOR] {
    // Each element contains `K` desired bits of `partial_ys` in the final offset of eventual `ys`
    // with the rest of bits being in undefined state
    let pre_ys_bytes = array::from_fn(|i| {
        let partial_y_offset = i * usize::from(K);
        // Number of bytes that contain any of this element's `K` bits
        let partial_y_length =
            (partial_y_offset % u8::BITS as usize + usize::from(K)).div_ceil(u8::BITS as usize);
        let mut pre_y_bytes = 0u64.to_be_bytes();
        pre_y_bytes[..partial_y_length].copy_from_slice(
            &partial_ys[partial_y_offset / u8::BITS as usize..][..partial_y_length],
        );

        u64::from_be_bytes(pre_y_bytes)
    });
    // Per-element right shift that moves the `K` bits into the final offset of eventual `ys`
    let pre_ys_right_offset = array::from_fn(|i| {
        let partial_y_offset = i as u32 * u32::from(K);
        u64::from(u64::BITS - u32::from(K + PARAM_EXT) - partial_y_offset % u8::BITS)
    });
    let pre_ys = Simd::from_array(pre_ys_bytes) >> Simd::from_array(pre_ys_right_offset);

    // Mask for clearing the rest of bits of `pre_ys`.
    let pre_ys_mask = Simd::splat(
        (u32::MAX << usize::from(PARAM_EXT))
            & (u32::MAX >> (u32::BITS as usize - usize::from(K + PARAM_EXT))),
    );

    // Extract `PARAM_EXT` most significant bits from `xs` and store in the final offset of
    // eventual `ys` with the rest of bits being in undefined state.
    let pre_exts = xs >> Simd::splat(u32::from(K - PARAM_EXT));

    // Combine all of the bits together:
    // [padding zero bits][`K` bits from `partial_y`][`PARAM_EXT` bits from `x`]
    let ys = (pre_ys.cast() & pre_ys_mask) | pre_exts;

    Y::array_from_repr(ys.to_array())
}
429
/// Finds all matches between entries of two adjacent parent-table buckets and returns them as an
/// initialized prefix of `matches`.
///
/// Buckets are expected to be sentinel-padded at the end (as produced by [`group_by_buckets`]).
/// At most `REDUCED_MATCHES_COUNT + PARAM_M * 2` matches are written; see the comment inside the
/// loop for why this never overflows.
///
/// For verification use [`has_match`] instead.
///
/// # Safety
/// Left and right bucket positions must correspond to the parent table.
// TODO: Try to reduce the `matches` size further by processing `left_bucket` in chunks (like halves
//  for example)
#[cfg(feature = "alloc")]
unsafe fn find_matches_in_buckets<'a>(
    left_bucket_index: u32,
    left_bucket: &[(Position, Y); REDUCED_BUCKET_SIZE],
    right_bucket: &[(Position, Y); REDUCED_BUCKET_SIZE],
    // `PARAM_M as usize * 2` corresponds to the upper bound number of matches a single `y` in the
    // left bucket might have here
    matches: &'a mut [MaybeUninit<Match>; REDUCED_MATCHES_COUNT + PARAM_M as usize * 2],
    left_targets: &LeftTargets,
) -> &'a [Match] {
    // First `y` values covered by the left and right buckets respectively
    let left_base = left_bucket_index * u32::from(PARAM_BC);
    let right_base = left_base + u32::from(PARAM_BC);

    // Index the right bucket by `r = y - right_base` for constant-time target lookups below
    let mut rmap = Rmap::new();
    for &(right_position, y) in right_bucket {
        if right_position == Position::SENTINEL {
            break;
        }
        let r = R::from((u32::from(y) - right_base) as u16);
        // SAFETY: `r` is within `0..PARAM_BC` range by definition, the right bucket is limited to
        // `REDUCED_BUCKETS_SIZE`
        unsafe {
            rmap.add(r, right_position);
        }
    }

    let parity = left_base % 2;
    let left_targets_parity = &left_targets[parity as usize];
    let mut next_match_index = 0;

    // TODO: Simd read for left bucket? It might be more efficient in terms of memory access to
    //  process chunks of the left bucket against one right value for each at a time
    for &(left_position, y) in left_bucket {
        // The `next_match_index >= REDUCED_MATCHES_COUNT` check is crucial to make sure the spare
        // capacity of `matches` (`PARAM_M as usize * 2` extra entries) is always enough for all
        // matches a single left entry can produce in the loops below
        if left_position == Position::SENTINEL || next_match_index >= REDUCED_MATCHES_COUNT {
            // Sentinel values are padded to the end of the bucket
            break;
        }

        let r = R::from((u32::from(y) - left_base) as u16);
        // SAFETY: `r` is within a bucket and exists by definition
        let left_targets_r = unsafe { left_targets_parity.get_unchecked(usize::from(r)) };

        for &r_target in left_targets_r.iter() {
            // SAFETY: Targets are always limited to `PARAM_BC`
            let right_positions = unsafe { rmap.get(r_target) };

            for &right_position in right_positions {
                // Defensive bound; with the check at the top of the outer loop this is not
                // expected to be hit
                if next_match_index >= matches.len() {
                    break;
                }
                // SAFETY: Bounds checked above
                unsafe { matches.get_unchecked_mut(next_match_index) }.write(Match {
                    left_position,
                    left_y: y,
                    right_position,
                });
                next_match_index += 1;
            }
        }
    }

    // SAFETY: Initialized this many matches
    unsafe { matches[..next_match_index].assume_init_ref() }
}
501
502/// Simplified version of [`find_matches_in_buckets`] for verification purposes.
503pub(super) fn has_match(left_y: Y, right_y: Y) -> bool {
504    let right_r = u32::from(right_y) % u32::from(PARAM_BC);
505    let parity = (u32::from(left_y) / u32::from(PARAM_BC)) % 2;
506    let left_r = u32::from(left_y) % u32::from(PARAM_BC);
507
508    let r_targets = array::from_fn::<_, { PARAM_M as usize }, _>(|i| {
509        calculate_left_target_on_demand(parity, left_r, i as u32)
510    });
511
512    r_targets.contains(&right_r)
513}
514
/// Computes the `y` and metadata of a table `TABLE_NUMBER` entry from the matched pair's left
/// `y` and both parents' metadata by BLAKE3-hashing their bit-level concatenation.
///
/// The new `y` is the top `y_size_bits(K)` bits of the hash. The new metadata is the parents'
/// metadata concatenated for `TABLE_NUMBER < 4`, further hash bits for tables with non-zero
/// metadata size, and empty for table 7.
#[inline(always)]
pub(super) fn compute_fn<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    y: Y,
    left_metadata: Metadata<K, PARENT_TABLE_NUMBER>,
    right_metadata: Metadata<K, PARENT_TABLE_NUMBER>,
) -> (Y, Metadata<K, TABLE_NUMBER>)
where
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
{
    let left_metadata = u128::from(left_metadata);
    let right_metadata = u128::from(right_metadata);

    let parent_metadata_bits = metadata_size_bits(K, PARENT_TABLE_NUMBER);

    // Part of the `right_bits` at the final offset of eventual `input_a`
    let y_and_left_bits = y_size_bits(K) + parent_metadata_bits;
    let right_bits_start_offset = u128::BITS as usize - parent_metadata_bits;

    // Take only bytes where bits were set
    let num_bytes_with_data =
        (y_size_bits(K) + parent_metadata_bits * 2).div_ceil(u8::BITS as usize);

    // Only supports `K` from 15 to 25 (otherwise math will not be correct when concatenating y,
    // left metadata and right metadata)
    let hash = {
        // Collect `K` most significant bits of `y` at the final offset of eventual `input_a`
        let y_bits = u128::from(y) << (u128::BITS as usize - y_size_bits(K));

        // Move bits of `left_metadata` at the final offset of eventual `input_a`
        let left_metadata_bits =
            left_metadata << (u128::BITS as usize - parent_metadata_bits - y_size_bits(K));

        // If `right_metadata` bits start to the left of the desired position in `input_a` move
        // bits right, else move left
        if right_bits_start_offset < y_and_left_bits {
            let right_bits_pushed_into_input_b = y_and_left_bits - right_bits_start_offset;
            // Collect bits of `right_metadata` that will fit into `input_a` at the final offset in
            // eventual `input_a`
            let right_bits_a = right_metadata >> right_bits_pushed_into_input_b;
            let input_a = y_bits | left_metadata_bits | right_bits_a;
            // Collect bits of `right_metadata` that will spill over into `input_b`
            let input_b = right_metadata << (u128::BITS as usize - right_bits_pushed_into_input_b);

            let input = [input_a.to_be_bytes(), input_b.to_be_bytes()];
            let input_len =
                size_of::<u128>() + right_bits_pushed_into_input_b.div_ceil(u8::BITS as usize);
            blake3_hash(&input.as_flattened()[..input_len])
        } else {
            let right_bits_a = right_metadata << (right_bits_start_offset - y_and_left_bits);
            let input_a = y_bits | left_metadata_bits | right_bits_a;

            blake3_hash(&input_a.to_be_bytes()[..num_bytes_with_data])
        }
    };

    // New `y` is the top `y_size_bits(K)` bits of the hash
    let y_output = Y::from(
        u32::from_be_bytes([hash[0], hash[1], hash[2], hash[3]])
            >> (u32::BITS as usize - y_size_bits(K)),
    );

    let metadata_size_bits = metadata_size_bits(K, TABLE_NUMBER);

    // Tables 2 and 3 carry the concatenated parent metadata; later tables (with non-zero metadata
    // size) derive metadata from further bits of the hash; table 7 has no metadata
    let metadata = if TABLE_NUMBER < 4 {
        (left_metadata << parent_metadata_bits) | right_metadata
    } else if metadata_size_bits > 0 {
        // For K up to 25 it is guaranteed that metadata + bit offset will always fit into u128.
        // We collect the bytes necessary, potentially with extra bits at the start and end of the
        // bytes that will be taken care of later.
        let metadata = u128::from_be_bytes(
            hash[y_size_bits(K) / u8::BITS as usize..][..size_of::<u128>()]
                .try_into()
                .expect("Always enough bits for any K; qed"),
        );
        // Remove extra bits at the beginning
        let metadata = metadata << (y_size_bits(K) % u8::BITS as usize);
        // Move bits into the correct location
        metadata >> (u128::BITS as usize - metadata_size_bits)
    } else {
        0
    };

    (y_output, Metadata::from(metadata))
}
599
// TODO: This is actually using only pipelining rather than real SIMD (at least explicitly) due to:
//  * https://github.com/rust-lang/portable-simd/issues/108
//  * https://github.com/BLAKE3-team/BLAKE3/issues/478#issuecomment-3200106103
/// Batched variant of [`compute_fn`] that processes `COMPUTE_FN_SIMD_FACTOR` matched pairs at
/// once; see [`compute_fn`] for details of the per-element computation.
#[cfg(any(feature = "alloc", test))]
fn compute_fn_simd<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    left_ys: [Y; COMPUTE_FN_SIMD_FACTOR],
    left_metadatas: [Metadata<K, PARENT_TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
    right_metadatas: [Metadata<K, PARENT_TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
) -> (
    [Y; COMPUTE_FN_SIMD_FACTOR],
    [Metadata<K, TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
)
where
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
{
    let parent_metadata_bits = metadata_size_bits(K, PARENT_TABLE_NUMBER);
    let metadata_size_bits = metadata_size_bits(K, TABLE_NUMBER);

    // TODO: `u128` is not supported as SIMD element yet, see
    //  https://github.com/rust-lang/portable-simd/issues/108
    let left_metadatas: [u128; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
        [
        #(
            u128::from(left_metadatas[N]),
        )*
        ]
    });
    let right_metadatas: [u128; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
        [
        #(
            u128::from(right_metadatas[N]),
        )*
        ]
    });

    // Part of the `right_bits` at the final offset of eventual `input_a`
    let y_and_left_bits = y_size_bits(K) + parent_metadata_bits;
    let right_bits_start_offset = u128::BITS as usize - parent_metadata_bits;

    // Take only bytes where bits were set
    let num_bytes_with_data =
        (y_size_bits(K) + parent_metadata_bits * 2).div_ceil(u8::BITS as usize);

    // Only supports `K` from 15 to 25 (otherwise math will not be correct when concatenating y,
    // left metadata and right metadata)
    // TODO: SIMD hashing once this is possible:
    //  https://github.com/BLAKE3-team/BLAKE3/issues/478#issuecomment-3200106103
    let hashes: [_; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
        [
        #(
        {
            let y = left_ys[N];
            let left_metadata = left_metadatas[N];
            let right_metadata = right_metadatas[N];

            // Collect `K` most significant bits of `y` at the final offset of eventual
            // `input_a`
            let y_bits = u128::from(y) << (u128::BITS as usize - y_size_bits(K));

            // Move bits of `left_metadata` at the final offset of eventual `input_a`
            let left_metadata_bits =
                left_metadata << (u128::BITS as usize - parent_metadata_bits - y_size_bits(K));

            // If `right_metadata` bits start to the left of the desired position in `input_a` move
            // bits right, else move left
            if right_bits_start_offset < y_and_left_bits {
                let right_bits_pushed_into_input_b = y_and_left_bits - right_bits_start_offset;
                // Collect bits of `right_metadata` that will fit into `input_a` at the final offset
                // in eventual `input_a`
                let right_bits_a = right_metadata >> right_bits_pushed_into_input_b;
                let input_a = y_bits | left_metadata_bits | right_bits_a;
                // Collect bits of `right_metadata` that will spill over into `input_b`
                let input_b = right_metadata << (u128::BITS as usize - right_bits_pushed_into_input_b);

                let input = [input_a.to_be_bytes(), input_b.to_be_bytes()];
                let input_len =
                    size_of::<u128>() + right_bits_pushed_into_input_b.div_ceil(u8::BITS as usize);
                blake3_hash(&input.as_flattened()[..input_len])
            } else {
                let right_bits_a = right_metadata << (right_bits_start_offset - y_and_left_bits);
                let input_a = y_bits | left_metadata_bits | right_bits_a;

                blake3_hash(&input_a.to_be_bytes()[..num_bytes_with_data])
            }
        },
        )*
        ]
    });

    // New `y`s are the top `y_size_bits(K)` bits of each hash
    let y_outputs = Simd::from_array(
        hashes.map(|hash| u32::from_be_bytes([hash[0], hash[1], hash[2], hash[3]])),
    ) >> (u32::BITS - y_size_bits(K) as u32);
    let y_outputs = Y::array_from_repr(y_outputs.to_array());

    // Tables 2 and 3 carry the concatenated parent metadata; later tables (with non-zero metadata
    // size) derive metadata from further bits of the hash; table 7 has no metadata
    let metadatas = if TABLE_NUMBER < 4 {
        seq!(N in 0..16 {
            [
            #(
                Metadata::from((left_metadatas[N] << parent_metadata_bits) | right_metadatas[N]),
            )*
            ]
        })
    } else if metadata_size_bits > 0 {
        // For K up to 25 it is guaranteed that metadata + bit offset will always fit into u128.
        // We collect the bytes necessary, potentially with extra bits at the start and end of the
        // bytes that will be taken care of later.
        seq!(N in 0..16 {
            [
            #(
            {
                let metadata = u128::from_be_bytes(
                    hashes[N][y_size_bits(K) / u8::BITS as usize..][..size_of::<u128>()]
                        .try_into()
                        .expect("Always enough bits for any K; qed"),
                );
                // Remove extra bits at the beginning
                let metadata = metadata << (y_size_bits(K) % u8::BITS as usize);
                // Move bits into the correct location
                Metadata::from(metadata >> (u128::BITS as usize - metadata_size_bits))
            },
            )*
            ]
        })
    } else {
        [Metadata::default(); _]
    };

    (y_outputs, metadatas)
}
730
/// Turns a single [`Match`] into the resulting table entry: the new `y`, the pair of parent
/// positions and the new metadata.
///
/// # Safety
/// `m` must contain positions that correspond to the parent table
#[cfg(feature = "alloc")]
unsafe fn match_to_result<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
    m: &Match,
) -> (Y, [Position; 2], Metadata<K, TABLE_NUMBER>)
where
    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    let Match {
        left_position,
        left_y,
        right_position,
    } = *m;

    // SAFETY: Guaranteed by function contract
    let left_metadata = unsafe { parent_table.metadata(left_position) };
    // SAFETY: Guaranteed by function contract
    let right_metadata = unsafe { parent_table.metadata(right_position) };

    let (y, metadata) =
        compute_fn::<K, TABLE_NUMBER, PARENT_TABLE_NUMBER>(left_y, left_metadata, right_metadata);

    (y, [left_position, right_position], metadata)
}
756
/// SIMD version of [`match_to_result()`] that turns [`COMPUTE_FN_SIMD_FACTOR`] matches into table
/// entries in one call via [`compute_fn_simd()`].
///
/// NOTE: the `seq!(N in 0..16)` invocations below hard-code the unrolling factor and must stay in
/// sync with `COMPUTE_FN_SIMD_FACTOR == 16`.
///
/// # Safety
/// `matches` must contain positions that correspond to the parent table
#[cfg(feature = "alloc")]
#[inline(always)]
unsafe fn match_to_result_simd<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
    matches: &[Match; COMPUTE_FN_SIMD_FACTOR],
) -> (
    [Y; COMPUTE_FN_SIMD_FACTOR],
    [[Position; 2]; COMPUTE_FN_SIMD_FACTOR],
    [Metadata<K, TABLE_NUMBER>; COMPUTE_FN_SIMD_FACTOR],
)
where
    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    // Gather the left `y`s of all matches into one array for the SIMD computation
    let left_ys: [_; COMPUTE_FN_SIMD_FACTOR] = seq!(N in 0..16 {
        [
        #(
            matches[N].left_y,
        )*
        ]
    });
    // SAFETY: Guaranteed by function contract
    let left_metadatas: [_; COMPUTE_FN_SIMD_FACTOR] = unsafe {
        seq!(N in 0..16 {
            [
            #(
                parent_table.metadata(matches[N].left_position),
            )*
            ]
        })
    };
    // SAFETY: Guaranteed by function contract
    let right_metadatas: [_; COMPUTE_FN_SIMD_FACTOR] = unsafe {
        seq!(N in 0..16 {
            [
            #(
                parent_table.metadata(matches[N].right_position),
            )*
            ]
        })
    };

    let (y_outputs, metadatas) = compute_fn_simd::<K, TABLE_NUMBER, PARENT_TABLE_NUMBER>(
        left_ys,
        left_metadatas,
        right_metadatas,
    );

    // Pair up the parent positions of every match for storage in the new table
    let positions = seq!(N in 0..16 {
        [
        #(
            [
                matches[N].left_position,
                matches[N].right_position,
            ],
        )*
        ]
    });

    (y_outputs, positions, metadatas)
}
824
/// Computes results for a slice of matches, writing them into the provided uninitialized output
/// slices.
///
/// Matches are processed in groups of [`COMPUTE_FN_SIMD_FACTOR`] via [`match_to_result_simd()`],
/// with the remainder handled one at a time by [`match_to_result()`]. Exactly `matches.len()`
/// elements of each output slice are initialized (metadata only when the table has any).
///
/// # Safety
/// `matches` must contain positions that correspond to the parent table. `ys`, `positions` and
/// `metadatas` length must be at least the length of `matches`
#[cfg(feature = "alloc")]
#[inline(always)]
unsafe fn matches_to_results<const K: u8, const TABLE_NUMBER: u8, const PARENT_TABLE_NUMBER: u8>(
    parent_table: &Table<K, PARENT_TABLE_NUMBER>,
    matches: &[Match],
    ys: &mut [MaybeUninit<Y>],
    positions: &mut [MaybeUninit<[Position; 2]>],
    metadatas: &mut [MaybeUninit<Metadata<K, TABLE_NUMBER>>],
) where
    Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
    EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    // Split inputs and outputs into full SIMD-sized chunks plus a scalar remainder; output slices
    // are split at the same offset as the matches so the chunks stay aligned with each other
    let (grouped_matches, other_matches) = matches.as_chunks::<COMPUTE_FN_SIMD_FACTOR>();
    let (grouped_ys, other_ys) = ys.split_at_mut(grouped_matches.as_flattened().len());
    let grouped_ys = grouped_ys.as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>().0;
    let (grouped_positions, other_positions) =
        positions.split_at_mut(grouped_matches.as_flattened().len());
    let grouped_positions = grouped_positions
        .as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>()
        .0;
    let (grouped_metadatas, other_metadatas) =
        metadatas.split_at_mut(grouped_matches.as_flattened().len());
    let grouped_metadatas = grouped_metadatas
        .as_chunks_mut::<COMPUTE_FN_SIMD_FACTOR>()
        .0;

    for (((grouped_matches, grouped_ys), grouped_positions), grouped_metadatas) in grouped_matches
        .iter()
        .zip(grouped_ys)
        .zip(grouped_positions)
        .zip(grouped_metadatas)
    {
        // SAFETY: Guaranteed by function contract
        let (ys_group, positions_group, metadatas_group) =
            unsafe { match_to_result_simd(parent_table, grouped_matches) };
        grouped_ys.write_copy_of_slice(&ys_group);
        grouped_positions.write_copy_of_slice(&positions_group);

        // The last table doesn't have metadata
        if metadata_size_bits(K, TABLE_NUMBER) > 0 {
            grouped_metadatas.write_copy_of_slice(&metadatas_group);
        }
    }
    // Scalar tail: fewer than `COMPUTE_FN_SIMD_FACTOR` matches remain
    for (((other_match, other_y), other_positions), other_metadata) in other_matches
        .iter()
        .zip(other_ys)
        .zip(other_positions)
        .zip(other_metadatas)
    {
        // SAFETY: Guaranteed by function contract
        let (y, p, metadata) = unsafe { match_to_result(parent_table, other_match) };
        other_y.write(y);
        other_positions.write(p);
        // The last table doesn't have metadata
        if metadata_size_bits(K, TABLE_NUMBER) > 0 {
            other_metadata.write(metadata);
        }
    }
}
891
/// Similar to [`Table`], but smaller size for later processing stages: only the positions pointing
/// into the previous table are retained (`y`s, metadata and buckets are dropped)
#[cfg(feature = "alloc")]
#[derive(Debug)]
pub(super) enum PrunedTable<const K: u8, const TABLE_NUMBER: u8>
where
    [(); 1 << K]:,
    [(); num_buckets(K) - 1]:,
{
    /// First table, which has no parent table and hence stores no positions
    First,
    /// Other tables, with positions stored contiguously
    Other {
        /// Left and right entry positions in a previous table encoded into bits
        positions: Box<[MaybeUninit<[Position; 2]>; 1 << K]>,
    },
    /// Other tables, with positions stored per bucket (pruned from [`Table::OtherBuckets`])
    #[cfg(feature = "parallel")]
    OtherBuckets {
        /// Left and right entry positions in a previous table encoded into bits.
        ///
        /// Only positions that were reachable through the originating [`Table`]'s buckets are
        /// guaranteed to be initialized.
        positions: Box<[[MaybeUninit<[Position; 2]>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1]>,
    },
}
915
#[cfg(feature = "alloc")]
impl<const K: u8, const TABLE_NUMBER: u8> PrunedTable<K, TABLE_NUMBER>
where
    [(); 1 << K]:,
    [(); num_buckets(K) - 1]:,
{
    /// Get `[left_position, right_position]` of a previous table for a specified position in a
    /// current table.
    ///
    /// # Safety
    /// `position` must come from [`Table::buckets()`] or [`Self::position()`] and not be a sentinel
    /// value.
    #[inline(always)]
    pub(super) unsafe fn position(&self, position: Position) -> [Position; 2] {
        match self {
            Self::First => {
                // The first table has no parent, so there are no positions to look up
                unreachable!("Not the first table");
            }
            Self::Other { positions, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe { positions.get_unchecked(usize::from(position)).assume_init() }
            }
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { positions, .. } => {
                // Positions are stored per bucket here, so the nested array is flattened before
                // indexing with the global position
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe {
                    positions
                        .as_flattened()
                        .get_unchecked(usize::from(position))
                        .assume_init()
                }
            }
        }
    }
}
951
/// A single chiapos table: entries (`y`s, parent positions, metadata) plus `y`s grouped into
/// buckets for matching against the next table
#[cfg(feature = "alloc")]
#[derive(Debug)]
pub(super) enum Table<const K: u8, const TABLE_NUMBER: u8>
where
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    /// First table
    First {
        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
        ///
        /// Buckets are padded with sentinel values to `REDUCED_BUCKET_SIZE`.
        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
    },
    /// Other tables, with entries stored contiguously
    Other {
        /// Left and right entry positions in a previous table encoded into bits
        positions: Box<[MaybeUninit<[Position; 2]>; 1 << K]>,
        /// Metadata corresponding to each entry
        metadatas: Box<[MaybeUninit<Metadata<K, TABLE_NUMBER>>; 1 << K]>,
        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
        ///
        /// Buckets are padded with sentinel values to `REDUCED_BUCKET_SIZE`.
        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
    },
    /// Other tables, with entries stored per parent-bucket (layout produced by parallel table
    /// creation)
    #[cfg(feature = "parallel")]
    OtherBuckets {
        /// Left and right entry positions in a previous table encoded into bits.
        ///
        /// Only positions from the `buckets` field are guaranteed to be initialized.
        positions: Box<[[MaybeUninit<[Position; 2]>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1]>,
        /// Metadata corresponding to each entry.
        ///
        /// Only positions from the `buckets` field are guaranteed to be initialized.
        metadatas: Box<
            [[MaybeUninit<Metadata<K, TABLE_NUMBER>>; REDUCED_MATCHES_COUNT]; num_buckets(K) - 1],
        >,
        /// Each bucket contains positions of `Y` values that belong to it and corresponding `y`.
        ///
        /// Buckets are padded with sentinel values to `REDUCED_BUCKET_SIZE`.
        buckets: Box<[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)]>,
    },
}
998
#[cfg(feature = "alloc")]
impl<const K: u8> Table<K, 1>
where
    EvaluatableUsize<{ metadata_size_bytes(K, 1) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    /// Shared implementation of [`Self::create()`] and [`Self::create_parallel()`].
    ///
    /// Computes `f1` for every `x` in `0..2^K` in SIMD-sized batches, then groups the resulting
    /// `y`s by bucket and sorts each bucket.
    fn create_internal(seed: Seed) -> Self
    where
        EvaluatableUsize<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>: Sized,
    {
        // `MAX_BUCKET_SIZE` is not actively used, but is an upper-bound reference for the other
        // parameters
        debug_assert!(
            MAX_BUCKET_SIZE >= bucket_size_upper_bound(K, BUCKET_SIZE_UPPER_BOUND_SECURITY_BITS),
            "Max bucket size is not sufficiently large"
        );

        let partial_ys = partial_ys::<K>(seed);

        // SAFETY: Contents is `MaybeUninit`
        let mut ys = unsafe { Box::<[MaybeUninit<Y>; 1 << K]>::new_uninit().assume_init() };

        // Fill `ys` batch by batch: each iteration computes `f1` for `COMPUTE_F1_SIMD_FACTOR`
        // consecutive `x`s from the corresponding chunk of `partial_ys`
        for ((ys, xs_batch_start), partial_ys) in ys
            .as_chunks_mut::<COMPUTE_F1_SIMD_FACTOR>()
            .0
            .iter_mut()
            .zip((X::ZERO..).step_by(COMPUTE_F1_SIMD_FACTOR))
            .zip(
                partial_ys
                    .as_chunks::<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>()
                    .0,
            )
        {
            let xs = Simd::splat(u32::from(xs_batch_start))
                + Simd::from_array(array::from_fn(|i| i as u32));
            let ys_batch = compute_f1_simd::<K>(xs, partial_ys);

            ys.write_copy_of_slice(&ys_batch);
        }

        // SAFETY: Converting a boxed array to a vector of the same size, which has the same memory
        // layout, all elements were initialized
        let ys = unsafe {
            let ys_len = ys.len();
            let ys = Box::into_raw(ys);
            Vec::from_raw_parts(ys.cast(), ys_len, ys_len)
        };

        // TODO: Try to group buckets in the process of collecting `y`s
        let mut buckets = group_by_buckets::<K>(&ys);
        sort_buckets::<K>(&mut buckets);

        Self::First { buckets }
    }

    /// Create the table
    pub(super) fn create(seed: Seed) -> Self
    where
        EvaluatableUsize<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>: Sized,
    {
        Self::create_internal(seed)
    }

    /// Create the table, leverages available parallelism
    #[cfg(feature = "parallel")]
    pub(super) fn create_parallel(seed: Seed) -> Self
    where
        EvaluatableUsize<{ K as usize * COMPUTE_F1_SIMD_FACTOR / u8::BITS as usize }>: Sized,
    {
        // TODO: Try parallelism in the `f1` computation loop? For now this is identical to
        // `create()`.
        Self::create_internal(seed)
    }
}
1109
#[cfg(feature = "alloc")]
mod private {
    /// Marker trait for tables that can be derived from a parent table (implemented for tables 2
    /// through 7)
    pub(in super::super) trait SupportedOtherTables {}
    /// Marker trait for tables that are followed by another table and hence expose metadata
    /// (implemented for tables 1 through 6)
    pub(in super::super) trait NotLastTable {}
}
1115
/// Generates the [`private::SupportedOtherTables`] marker implementations for the given table
/// numbers, replacing six identical hand-written impl blocks
macro_rules! impl_supported_other_tables {
    ($($table_number:literal),+ $(,)?) => {
        $(
            #[cfg(feature = "alloc")]
            impl<const K: u8> private::SupportedOtherTables for Table<K, $table_number>
            where
                EvaluatableUsize<{ metadata_size_bytes(K, $table_number) }>: Sized,
                [(); 1 << K]:,
                [(); num_buckets(K)]:,
                [(); num_buckets(K) - 1]:,
            {
            }
        )+
    };
}

// Tables 2 through 7 can be created from a parent table
impl_supported_other_tables!(2, 3, 4, 5, 6, 7);
1170
/// Generates the [`private::NotLastTable`] marker implementations for the given table numbers,
/// replacing six identical hand-written impl blocks
macro_rules! impl_not_last_table {
    ($($table_number:literal),+ $(,)?) => {
        $(
            #[cfg(feature = "alloc")]
            impl<const K: u8> private::NotLastTable for Table<K, $table_number>
            where
                EvaluatableUsize<{ metadata_size_bytes(K, $table_number) }>: Sized,
                [(); 1 << K]:,
                [(); num_buckets(K)]:,
                [(); num_buckets(K) - 1]:,
            {
            }
        )+
    };
}

// Tables 1 through 6 are followed by another table
impl_not_last_table!(1, 2, 3, 4, 5, 6);
1225
#[cfg(feature = "alloc")]
impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
where
    Self: private::SupportedOtherTables,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    /// Creates a new [`TABLE_NUMBER`] table. There also exists [`Self::create_parallel()`] that
    /// trades CPU efficiency and memory usage for lower latency; with multiple parallel calls,
    /// this method offers better overall performance.
    ///
    /// Iterates over pairs of adjacent buckets of the parent table, finds matches in each pair and
    /// turns every match into an entry of the new table. The parent table is pruned down to just
    /// its positions before being returned alongside the new table.
    pub(super) fn create<const PARENT_TABLE_NUMBER: u8>(
        parent_table: Table<K, PARENT_TABLE_NUMBER>,
        cache: &TablesCache,
    ) -> (Self, PrunedTable<K, PARENT_TABLE_NUMBER>)
    where
        Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
        EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    {
        let left_targets = &*cache.left_targets;
        // Number of elements at the start of `ys`/`positions`/`metadatas` that have been
        // initialized so far
        let mut initialized_elements = 0_usize;
        // SAFETY: Contents is `MaybeUninit`
        let mut ys = unsafe { Box::<[MaybeUninit<Y>; 1 << K]>::new_uninit().assume_init() };
        // SAFETY: Contents is `MaybeUninit`
        let mut positions =
            unsafe { Box::<[MaybeUninit<[Position; 2]>; 1 << K]>::new_uninit().assume_init() };
        // The last table doesn't have metadata
        // SAFETY: Contents is `MaybeUninit`
        let mut metadatas = unsafe {
            Box::<[MaybeUninit<Metadata<K, TABLE_NUMBER>>; 1 << K]>::new_uninit().assume_init()
        };

        // Matches are searched for between each pair of adjacent buckets, hence the sliding window
        for ([left_bucket, right_bucket], left_bucket_index) in
            parent_table.buckets().array_windows().zip(0..)
        {
            let mut matches = [MaybeUninit::uninit(); _];
            // SAFETY: Positions are taken from `Table::buckets()` and correspond to initialized
            // values
            let matches = unsafe {
                find_matches_in_buckets(
                    left_bucket_index,
                    left_bucket,
                    right_bucket,
                    &mut matches,
                    left_targets,
                )
            };
            // Throw away some successful matches that are not that necessary
            let matches = &matches[..matches.len().min(REDUCED_MATCHES_COUNT)];
            // Skip past the already-initialized prefix of the output arrays
            // SAFETY: Already initialized this many elements
            let (ys, positions, metadatas) = unsafe {
                (
                    ys.split_at_mut_unchecked(initialized_elements).1,
                    positions.split_at_mut_unchecked(initialized_elements).1,
                    metadatas.split_at_mut_unchecked(initialized_elements).1,
                )
            };

            // Narrow the output slices to exactly the number of matches to be written
            // SAFETY: Preallocated length is an upper bound and is always sufficient
            let (ys, positions, metadatas) = unsafe {
                (
                    ys.split_at_mut_unchecked(matches.len()).0,
                    positions.split_at_mut_unchecked(matches.len()).0,
                    metadatas.split_at_mut_unchecked(matches.len()).0,
                )
            };

            // SAFETY: Matches come from the parent table and the size of `ys`, `positions`
            // and `metadatas` is the same as the number of matches
            unsafe {
                matches_to_results(&parent_table, matches, ys, positions, metadatas);
            }

            initialized_elements += matches.len();
        }

        // Only the positions of the parent table are needed from now on
        let parent_table = parent_table.prune();

        // SAFETY: Converting a boxed array to a vector of the same size, which has the same memory
        // layout, the number of elements matches the number of elements that were initialized
        let ys = unsafe {
            let ys_len = ys.len();
            let ys = Box::into_raw(ys);
            Vec::from_raw_parts(ys.cast(), initialized_elements, ys_len)
        };

        // TODO: Try to group buckets in the process of collecting `y`s
        let mut buckets = group_by_buckets::<K>(&ys);
        sort_buckets::<K>(&mut buckets);

        let table = Self::Other {
            positions,
            metadatas,
            buckets,
        };

        (table, parent_table)
    }

    /// Almost the same as [`Self::create()`], but uses parallelism internally for better
    /// performance (though not efficiency of CPU and memory usage); if you create multiple tables
    /// in parallel, prefer [`Self::create()`] for better overall performance.
    #[cfg(feature = "parallel")]
    pub(super) fn create_parallel<const PARENT_TABLE_NUMBER: u8>(
        parent_table: Table<K, PARENT_TABLE_NUMBER>,
        cache: &TablesCache,
    ) -> (Self, PrunedTable<K, PARENT_TABLE_NUMBER>)
    where
        Table<K, PARENT_TABLE_NUMBER>: private::NotLastTable,
        EvaluatableUsize<{ metadata_size_bytes(K, PARENT_TABLE_NUMBER) }>: Sized,
    {
        // Per-bucket output storage, wrapped in `SyncUnsafeCell` so that worker threads can write
        // to distinct buckets without locking
        // SAFETY: Contents is `MaybeUninit`
        let ys = unsafe {
            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
        };
        // SAFETY: Contents is `MaybeUninit`
        let positions = unsafe {
            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
        };
        // SAFETY: Contents is `MaybeUninit`
        let metadatas = unsafe {
            Box::<[SyncUnsafeCell<[MaybeUninit<_>; REDUCED_MATCHES_COUNT]>; num_buckets(K) - 1]>::new_uninit().assume_init()
        };
        // Number of initialized entries per bucket, written by whichever worker thread processes
        // the bucket
        let global_results_counts =
            array::from_fn::<_, { num_buckets(K) - 1 }, _>(|_| SyncUnsafeCell::new(0u16));

        let left_targets = &*cache.left_targets;

        let buckets = parent_table.buckets();
        // Iterate over buckets in batches, such that a cache line worth of bytes is taken from
        // `global_results_counts` each time to avoid unnecessary false sharing
        let bucket_batch_size = CACHE_LINE_SIZE / size_of::<u16>();
        let bucket_batch_index = AtomicUsize::new(0);

        rayon::broadcast(|_ctx| {
            loop {
                // Claim the next batch; the atomic counter ensures each batch is processed by
                // exactly one thread
                let bucket_batch_index = bucket_batch_index.fetch_add(1, Ordering::Relaxed);

                let buckets_batch = buckets
                    .array_windows::<2>()
                    .enumerate()
                    .skip(bucket_batch_index * bucket_batch_size)
                    .take(bucket_batch_size);

                if buckets_batch.is_empty() {
                    break;
                }

                for (left_bucket_index, [left_bucket, right_bucket]) in buckets_batch {
                    let mut matches = [MaybeUninit::uninit(); _];
                    // SAFETY: Positions are taken from `Table::buckets()` and correspond to
                    // initialized values
                    let matches = unsafe {
                        find_matches_in_buckets(
                            left_bucket_index as u32,
                            left_bucket,
                            right_bucket,
                            &mut matches,
                            left_targets,
                        )
                    };
                    // Throw away some successful matches that are not that necessary
                    let matches = &matches[..matches.len().min(REDUCED_MATCHES_COUNT)];

                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
                    // at this time, and it is guaranteed to be in range
                    let ys = unsafe { &mut *ys.get_unchecked(left_bucket_index).get() };
                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
                    // at this time, and it is guaranteed to be in range
                    let positions =
                        unsafe { &mut *positions.get_unchecked(left_bucket_index).get() };
                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
                    // at this time, and it is guaranteed to be in range
                    let metadatas =
                        unsafe { &mut *metadatas.get_unchecked(left_bucket_index).get() };
                    // SAFETY: This is the only place where `left_bucket_index`'s entry is accessed
                    // at this time, and it is guaranteed to be in range
                    let count = unsafe {
                        &mut *global_results_counts.get_unchecked(left_bucket_index).get()
                    };

                    // SAFETY: Matches come from the parent table and the size of `ys`, `positions`
                    // and `metadatas` is larger or equal to the number of matches
                    unsafe {
                        matches_to_results::<_, TABLE_NUMBER, _>(
                            &parent_table,
                            matches,
                            ys,
                            positions,
                            metadatas,
                        )
                    };
                    *count = matches.len() as u16;
                }
            }
        });

        // Only the positions of the parent table are needed from now on
        let parent_table = parent_table.prune();

        // All worker threads have finished, so exclusive access is regained and the cell wrappers
        // can be removed
        let ys = strip_sync_unsafe_cell(ys);
        let positions = strip_sync_unsafe_cell(positions);
        let metadatas = strip_sync_unsafe_cell(metadatas);

        // TODO: Try to group buckets in the process of collecting `y`s
        // SAFETY: `global_results_counts` corresponds to the number of initialized `ys`
        let mut buckets = unsafe {
            group_by_buckets_from_buckets::<K, _>(
                ys.iter().zip(
                    global_results_counts
                        .into_iter()
                        .map(|count| usize::from(count.into_inner())),
                ),
            )
        };
        sort_buckets::<K>(&mut buckets);

        let table = Self::OtherBuckets {
            positions,
            metadatas,
            buckets,
        };

        (table, parent_table)
    }

    /// Get `[left_position, right_position]` of a previous table for a specified position in a
    /// current table.
    ///
    /// # Safety
    /// `position` must come from [`Self::buckets()`] or [`Self::position()`] or
    /// [`PrunedTable::position()`] and not be a sentinel value.
    #[inline(always)]
    pub(super) unsafe fn position(&self, position: Position) -> [Position; 2] {
        match self {
            Self::First { .. } => {
                // The first table has no parent, so there are no positions to look up
                unreachable!("Not the first table");
            }
            Self::Other { positions, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe { positions.get_unchecked(usize::from(position)).assume_init() }
            }
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { positions, .. } => {
                // Positions are stored per bucket here, so the nested array is flattened before
                // indexing with the global position
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe {
                    positions
                        .as_flattened()
                        .get_unchecked(usize::from(position))
                        .assume_init()
                }
            }
        }
    }
}
1481
#[cfg(feature = "alloc")]
impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
where
    Self: private::NotLastTable,
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    /// Returns the metadata of the entry at `position`.
    ///
    /// For the first table the metadata is the `x` value itself, which is the same as the
    /// position. Only available for non-last tables ([`private::NotLastTable`]), since the last
    /// table stores no metadata.
    ///
    /// # Safety
    /// `position` must come from [`Self::buckets()`] and not be a sentinel value.
    #[inline(always)]
    unsafe fn metadata(&self, position: Position) -> Metadata<K, TABLE_NUMBER> {
        match self {
            Self::First { .. } => {
                // X matches position
                Metadata::from(X::from(u32::from(position)))
            }
            Self::Other { metadatas, .. } => {
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe { metadatas.get_unchecked(usize::from(position)).assume_init() }
            }
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { metadatas, .. } => {
                // Metadata is stored per bucket here, so the nested array is flattened before
                // indexing with the global position
                // SAFETY: All non-sentinel positions returned by [`Self::buckets()`] are valid
                unsafe {
                    metadatas
                        .as_flattened()
                        .get_unchecked(usize::from(position))
                        .assume_init()
                }
            }
        }
    }
}
1519
#[cfg(feature = "alloc")]
impl<const K: u8, const TABLE_NUMBER: u8> Table<K, TABLE_NUMBER>
where
    EvaluatableUsize<{ metadata_size_bytes(K, TABLE_NUMBER) }>: Sized,
    [(); 1 << K]:,
    [(); num_buckets(K)]:,
    [(); num_buckets(K) - 1]:,
{
    /// Converts the table into a [`PrunedTable`], keeping only the positions and dropping `y`s,
    /// metadata and buckets, freeing the memory of the dropped fields
    #[inline(always)]
    fn prune(self) -> PrunedTable<K, TABLE_NUMBER> {
        match self {
            Self::First { .. } => PrunedTable::First,
            Self::Other { positions, .. } => PrunedTable::Other { positions },
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { positions, .. } => PrunedTable::OtherBuckets { positions },
        }
    }

    /// Positions of `y`s grouped by the bucket they belong to
    #[inline(always)]
    pub(super) fn buckets(&self) -> &[[(Position, Y); REDUCED_BUCKET_SIZE]; num_buckets(K)] {
        match self {
            Self::First { buckets, .. } => buckets,
            Self::Other { buckets, .. } => buckets,
            #[cfg(feature = "parallel")]
            Self::OtherBuckets { buckets, .. } => buckets,
        }
    }
}