subspace_farmer/plotter/gpu/
cuda.rs

//! CUDA GPU records encoder
2
3use crate::plotter::gpu::GpuRecordsEncoder;
4use async_lock::Mutex as AsyncMutex;
5use parking_lot::Mutex;
6use rayon::{ThreadPool, ThreadPoolBuildError, ThreadPoolBuilder, current_thread_index};
7use std::process::exit;
8use std::sync::Arc;
9use std::sync::atomic::{AtomicBool, Ordering};
10use subspace_core_primitives::pieces::{PieceOffset, Record};
11use subspace_core_primitives::sectors::SectorId;
12use subspace_farmer_components::plotting::RecordsEncoder;
13use subspace_farmer_components::sector::SectorContentsMap;
14use subspace_proof_of_space_gpu::cuda::CudaDevice;
15
16/// CUDA implementation of [`GpuRecordsEncoder`]
17#[derive(Debug)]
18pub struct CudaRecordsEncoder {
19    cuda_device: CudaDevice,
20    thread_pool: ThreadPool,
21    global_mutex: Arc<AsyncMutex<()>>,
22}
23
24impl GpuRecordsEncoder for CudaRecordsEncoder {
25    const TYPE: &'static str = "cuda";
26}
27
28impl RecordsEncoder for CudaRecordsEncoder {
29    fn encode_records(
30        &mut self,
31        sector_id: &SectorId,
32        records: &mut [Record],
33        abort_early: &AtomicBool,
34    ) -> anyhow::Result<SectorContentsMap> {
35        let pieces_in_sector = records
36            .len()
37            .try_into()
38            .map_err(|error| anyhow::anyhow!("Failed to convert pieces in sector: {error}"))?;
39        let mut sector_contents_map = SectorContentsMap::new(pieces_in_sector);
40
41        {
42            let iter = Mutex::new(
43                (PieceOffset::ZERO..)
44                    .zip(records.iter_mut())
45                    .zip(sector_contents_map.iter_record_bitfields_mut()),
46            );
47            let plotting_error = Mutex::new(None::<String>);
48
49            self.thread_pool.scope(|scope| {
50                scope.spawn_broadcast(|_scope, _ctx| {
51                    loop {
52                        // Take mutex briefly to make sure encoding is allowed right now
53                        self.global_mutex.lock_blocking();
54
55                        // This instead of `while` above because otherwise mutex will be held for the
56                        // duration of the loop and will limit concurrency to 1 record
57                        let Some(((piece_offset, record), mut encoded_chunks_used)) =
58                            iter.lock().next()
59                        else {
60                            return;
61                        };
62                        let pos_seed = sector_id.derive_evaluation_seed(piece_offset);
63
64                        if let Err(error) = self.cuda_device.generate_and_encode_pospace(
65                            &pos_seed,
66                            record,
67                            encoded_chunks_used.iter_mut(),
68                        ) {
69                            plotting_error.lock().replace(error);
70                            return;
71                        }
72
73                        if abort_early.load(Ordering::Relaxed) {
74                            return;
75                        }
76                    }
77                });
78            });
79
80            let plotting_error = plotting_error.lock().take();
81            if let Some(error) = plotting_error {
82                return Err(anyhow::Error::msg(error));
83            }
84        }
85
86        Ok(sector_contents_map)
87    }
88}
89
90impl CudaRecordsEncoder {
91    /// Create new instance
92    pub fn new(
93        cuda_device: CudaDevice,
94        global_mutex: Arc<AsyncMutex<()>>,
95    ) -> Result<Self, ThreadPoolBuildError> {
96        let id = cuda_device.id();
97        let thread_name = move |thread_index| format!("cuda-{id}.{thread_index}");
98        // TODO: remove this panic handler when rayon logs panic_info
99        // https://github.com/rayon-rs/rayon/issues/1208
100        let panic_handler = move |panic_info| {
101            if let Some(index) = current_thread_index() {
102                eprintln!("panic on thread {}: {:?}", thread_name(index), panic_info);
103            } else {
104                // We want to guarantee exit, rather than panicking in a panic handler.
105                eprintln!("rayon panic handler called on non-rayon thread: {panic_info:?}");
106            }
107            exit(1);
108        };
109
110        let thread_pool = ThreadPoolBuilder::new()
111            .thread_name(thread_name)
112            .panic_handler(panic_handler)
113            // Make sure there is overlap between records, so GPU is almost always busy
114            .num_threads(2)
115            .build()?;
116
117        Ok(Self {
118            cuda_device,
119            thread_pool,
120            global_mutex,
121        })
122    }
123}