subspace_farmer/single_disk_farm/
direct_io_file.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
//! Wrapper data structure for direct/unbuffered I/O

use parking_lot::Mutex;
use static_assertions::const_assert_eq;
use std::fs::{File, OpenOptions};
use std::path::Path;
use std::{io, mem};
use subspace_farmer_components::file_ext::{FileExt, OpenOptionsExt};
use subspace_farmer_components::ReadAtSync;

/// 4096 is a relatively safe size due to sector size on SSDs commonly being 512 or 4096 bytes
pub const DISK_SECTOR_SIZE: usize = 4096;
/// Restrict how much data to read from disk in a single call to avoid very large memory usage
const MAX_READ_SIZE: usize = 1024 * 1024;

// The chunking logic in `read_exact_at`/`write_all_at` relies on `MAX_READ_SIZE` being a whole
// number of disk sectors so that chunk boundaries stay sector-aligned
const_assert_eq!(MAX_READ_SIZE % DISK_SECTOR_SIZE, 0);

/// One disk sector's worth of bytes, over-aligned to the sector size so the backing allocation is
/// always suitably aligned for direct/unbuffered I/O
#[derive(Debug, Copy, Clone)]
// NOTE: the `align(...)` literal must stay in sync with `DISK_SECTOR_SIZE` (attribute arguments
// cannot reference constants); the assert below enforces this at compile time
#[repr(C, align(4096))]
struct AlignedSectorSize([u8; DISK_SECTOR_SIZE]);

const_assert_eq!(align_of::<AlignedSectorSize>(), DISK_SECTOR_SIZE);

impl Default for AlignedSectorSize {
    /// A zero-filled sector, used when (re)sizing the scratch buffer.
    fn default() -> Self {
        let zeroed_sector = [0_u8; DISK_SECTOR_SIZE];
        Self(zeroed_sector)
    }
}

impl AlignedSectorSize {
    /// Reinterprets a slice of aligned sectors as a slice of plain byte arrays so callers can
    /// flatten it into one contiguous `&mut [u8]` without copying
    fn slice_mut_to_repr(slice: &mut [Self]) -> &mut [[u8; DISK_SECTOR_SIZE]] {
        // SAFETY: `AlignedSectorSize` is `#[repr(C)]` around a single `[u8; DISK_SECTOR_SIZE]`
        // field, so it has the same size and layout as the inner array; its alignment is larger
        // than the inner value's, so alignment requirements of the target type are satisfied
        unsafe { mem::transmute(slice) }
    }
}

/// Wrapper data structure for direct/unbuffered I/O
#[derive(Debug)]
pub struct DirectIoFile {
    file: File,
    /// Scratch buffer of aligned memory for reads and writes
    // Behind a mutex so concurrent reads/writes through `&self` share one bounded allocation
    scratch_buffer: Mutex<Vec<AlignedSectorSize>>,
}

impl ReadAtSync for DirectIoFile {
    #[inline]
    fn read_at(&self, buf: &mut [u8], offset: u64) -> io::Result<()> {
        self.read_exact_at(buf, offset)
    }
}

impl ReadAtSync for &DirectIoFile {
    #[inline]
    fn read_at(&self, buf: &mut [u8], offset: u64) -> io::Result<()> {
        // `self` is `&&DirectIoFile`; name the target impl explicitly so this cannot accidentally
        // resolve back to this impl (deref coercion turns `&&_` into `&_` at the call site)
        <DirectIoFile as ReadAtSync>::read_at(self, buf, offset)
    }
}

impl FileExt for DirectIoFile {
    fn size(&self) -> io::Result<u64> {
        Ok(self.file.metadata()?.len())
    }

    fn preallocate(&self, len: u64) -> io::Result<()> {
        self.file.preallocate(len)
    }

    fn advise_random_access(&self) -> io::Result<()> {
        // Ignore, already set (the file is opened for direct I/O in `DirectIoFile::open`)
        Ok(())
    }

    fn advise_sequential_access(&self) -> io::Result<()> {
        // Ignore, not supported
        Ok(())
    }

    fn disable_cache(&self) -> io::Result<()> {
        // Ignore, not supported (caching was already addressed at open time)
        Ok(())
    }

    // Reads `buf.len()` bytes at `offset` by issuing sector-aligned reads through the shared
    // scratch buffer, at most `MAX_READ_SIZE` bytes per internal read
    fn read_exact_at(&self, buf: &mut [u8], mut offset: u64) -> io::Result<()> {
        if buf.is_empty() {
            return Ok(());
        }

        // One scratch buffer is shared by all reads/writes; hold the lock for the whole request
        let mut scratch_buffer = self.scratch_buffer.lock();

        // First read up to `MAX_READ_SIZE - padding`
        // `padding` is the distance from the previous sector boundary; sizing the first chunk as
        // `MAX_READ_SIZE - padding` makes every subsequent chunk start sector-aligned (because
        // `MAX_READ_SIZE` is a multiple of `DISK_SECTOR_SIZE`), so later iterations have zero
        // padding and the scratch buffer never needs to exceed `MAX_READ_SIZE`
        let padding = (offset % DISK_SECTOR_SIZE as u64) as usize;
        let first_unaligned_chunk_size = (MAX_READ_SIZE - padding).min(buf.len());
        let (unaligned_start, buf) = buf.split_at_mut(first_unaligned_chunk_size);
        {
            let bytes_to_read = unaligned_start.len();
            unaligned_start.copy_from_slice(self.read_exact_at_internal(
                &mut scratch_buffer,
                bytes_to_read,
                offset,
            )?);
            offset += unaligned_start.len() as u64;
        }

        if buf.is_empty() {
            return Ok(());
        }

        // Process the rest of the chunks, up to `MAX_READ_SIZE` at a time
        for buf in buf.chunks_mut(MAX_READ_SIZE) {
            let bytes_to_read = buf.len();
            buf.copy_from_slice(self.read_exact_at_internal(
                &mut scratch_buffer,
                bytes_to_read,
                offset,
            )?);
            offset += buf.len() as u64;
        }

        Ok(())
    }

    // Writes `buf` at `offset`, chunked the same way as `read_exact_at` so each internal write
    // (including its read-modify-write padding) fits within `MAX_READ_SIZE`
    fn write_all_at(&self, buf: &[u8], mut offset: u64) -> io::Result<()> {
        if buf.is_empty() {
            return Ok(());
        }

        let mut scratch_buffer = self.scratch_buffer.lock();

        // First write up to `MAX_READ_SIZE - padding`
        // Same alignment trick as in `read_exact_at`: after the first chunk, `offset` is
        // sector-aligned and the remaining chunks need no front padding
        let padding = (offset % DISK_SECTOR_SIZE as u64) as usize;
        let first_unaligned_chunk_size = (MAX_READ_SIZE - padding).min(buf.len());
        let (unaligned_start, buf) = buf.split_at(first_unaligned_chunk_size);
        {
            self.write_all_at_internal(&mut scratch_buffer, unaligned_start, offset)?;
            offset += unaligned_start.len() as u64;
        }

        if buf.is_empty() {
            return Ok(());
        }

        // Process the rest of the chunks, up to `MAX_READ_SIZE` at a time
        for buf in buf.chunks(MAX_READ_SIZE) {
            self.write_all_at_internal(&mut scratch_buffer, buf, offset)?;
            offset += buf.len() as u64;
        }

        Ok(())
    }
}

impl DirectIoFile {
    /// Open file at specified path for direct/unbuffered I/O for reads (if file doesn't exist, it
    /// will be created).
    ///
    /// This is especially important on Windows to prevent huge memory usage.
    pub fn open<P>(path: P) -> io::Result<Self>
    where
        P: AsRef<Path>,
    {
        let mut open_options = OpenOptions::new();
        open_options.use_direct_io();
        let file = open_options
            .read(true)
            .write(true)
            .create(true)
            .truncate(false)
            .open(path)?;

        // Also disable caching on the opened handle (exact effect is platform-specific, see
        // `FileExt::disable_cache`)
        file.disable_cache()?;

        Ok(Self {
            file,
            // In many cases we'll want to read this much at once, so pre-allocate it right away
            scratch_buffer: Mutex::new(vec![
                AlignedSectorSize::default();
                MAX_READ_SIZE / DISK_SECTOR_SIZE
            ]),
        })
    }

    /// Truncates or extends the underlying file, updating the size of this file to become `size`.
    pub fn set_len(&self, size: u64) -> io::Result<()> {
        self.file.set_len(size)
    }

    /// Reads `bytes_to_read` bytes at `offset` via a sector-aligned read into `scratch_buffer`
    /// and returns a slice borrowing exactly the requested bytes (alignment padding stripped).
    ///
    /// Grows `scratch_buffer` if needed; callers keep requests within `MAX_READ_SIZE` (including
    /// padding), which preserves the size invariant asserted in `write_all_at_internal`.
    fn read_exact_at_internal<'a>(
        &self,
        scratch_buffer: &'a mut Vec<AlignedSectorSize>,
        bytes_to_read: usize,
        offset: u64,
    ) -> io::Result<&'a [u8]> {
        // Round `offset` down to the containing sector boundary
        let aligned_offset = offset / DISK_SECTOR_SIZE as u64 * DISK_SECTOR_SIZE as u64;
        let padding = (offset - aligned_offset) as usize;

        // Make scratch buffer of a size that is necessary to read aligned memory, accounting
        // for extra bytes at the beginning and the end that will be thrown away
        let desired_buffer_size = (padding + bytes_to_read).div_ceil(DISK_SECTOR_SIZE);
        if scratch_buffer.len() < desired_buffer_size {
            scratch_buffer.resize_with(desired_buffer_size, AlignedSectorSize::default);
        }
        // View the needed prefix of the sector buffer as one contiguous byte slice
        let scratch_buffer = AlignedSectorSize::slice_mut_to_repr(scratch_buffer)
            [..desired_buffer_size]
            .as_flattened_mut();

        self.file.read_exact_at(scratch_buffer, aligned_offset)?;

        // Skip the front padding and cap at the requested length
        Ok(&scratch_buffer[padding..][..bytes_to_read])
    }

    /// Panics on writes over `MAX_READ_SIZE` (including padding on both ends)
    fn write_all_at_internal(
        &self,
        scratch_buffer: &mut Vec<AlignedSectorSize>,
        bytes_to_write: &[u8],
        offset: u64,
    ) -> io::Result<()> {
        // This is guaranteed by constructor
        // (and upheld by callers, who never request more than `MAX_READ_SIZE` bytes per call)
        assert!(
            AlignedSectorSize::slice_mut_to_repr(scratch_buffer)
                .as_flattened()
                .len()
                <= MAX_READ_SIZE
        );

        // Round `offset` down to the containing sector boundary
        let aligned_offset = offset / DISK_SECTOR_SIZE as u64 * DISK_SECTOR_SIZE as u64;
        let padding = (offset - aligned_offset) as usize;

        // Calculate the size of the read including padding on both ends
        let bytes_to_read =
            (padding + bytes_to_write.len()).div_ceil(DISK_SECTOR_SIZE) * DISK_SECTOR_SIZE;

        if padding == 0 && bytes_to_read == bytes_to_write.len() {
            // Fully sector-aligned write: copy into aligned memory and write directly
            let scratch_buffer =
                AlignedSectorSize::slice_mut_to_repr(scratch_buffer).as_flattened_mut();
            let scratch_buffer = &mut scratch_buffer[..bytes_to_read];
            scratch_buffer.copy_from_slice(bytes_to_write);
            self.file.write_all_at(scratch_buffer, offset)?;
        } else {
            // Unaligned write: read-modify-write of the affected sectors.
            // NOTE(review): the read assumes the file already extends over the whole aligned span
            // (e.g. it was preallocated); otherwise it would fail with `UnexpectedEof` — confirm
            // callers guarantee this.
            // Read whole pages where `bytes_to_write` will be written
            self.read_exact_at_internal(scratch_buffer, bytes_to_read, aligned_offset)?;
            let scratch_buffer =
                AlignedSectorSize::slice_mut_to_repr(scratch_buffer).as_flattened_mut();
            let scratch_buffer = &mut scratch_buffer[..bytes_to_read];
            // Update contents of existing pages and write into the file
            scratch_buffer[padding..][..bytes_to_write.len()].copy_from_slice(bytes_to_write);
            self.file.write_all_at(scratch_buffer, aligned_offset)?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use crate::single_disk_farm::direct_io_file::{DirectIoFile, MAX_READ_SIZE};
    use rand::prelude::*;
    use std::fs;
    use subspace_farmer_components::file_ext::FileExt;
    use tempfile::tempdir;

    // Round-trips reads and writes at a mix of aligned/unaligned offsets and sizes, verifying
    // contents against an in-memory copy of the file
    #[test]
    fn basic() {
        let tempdir = tempdir().unwrap();
        let file_path = tempdir.as_ref().join("file.bin");
        // Random backing data large enough to exercise multi-chunk (> MAX_READ_SIZE) requests
        let mut data = vec![0u8; MAX_READ_SIZE * 5];
        thread_rng().fill(data.as_mut_slice());
        fs::write(&file_path, &data).unwrap();

        let file = DirectIoFile::open(&file_path).unwrap();

        let mut buffer = Vec::new();
        // (offset, size) pairs covering: sector-aligned and unaligned offsets, sizes below/at/
        // above one sector, and sizes below/at/above the MAX_READ_SIZE chunking threshold
        for (offset, size) in [
            (0_usize, 512_usize),
            (0_usize, 4096_usize),
            (0, 500),
            (0, 4000),
            (5, 50),
            (12, 500),
            (96, 4000),
            (4000, 96),
            (10000, 5),
            (0, MAX_READ_SIZE),
            (0, MAX_READ_SIZE * 2),
            (5, MAX_READ_SIZE - 5),
            (5, MAX_READ_SIZE * 2 - 5),
            (5, MAX_READ_SIZE),
            (5, MAX_READ_SIZE * 2),
            (MAX_READ_SIZE, MAX_READ_SIZE),
            (MAX_READ_SIZE, MAX_READ_SIZE * 2),
            (MAX_READ_SIZE + 5, MAX_READ_SIZE - 5),
            (MAX_READ_SIZE + 5, MAX_READ_SIZE * 2 - 5),
            (MAX_READ_SIZE + 5, MAX_READ_SIZE),
            (MAX_READ_SIZE + 5, MAX_READ_SIZE * 2),
        ] {
            let data = &mut data[offset..][..size];
            buffer.resize(size, 0);
            // Read contents
            file.read_exact_at(buffer.as_mut_slice(), offset as u64)
                .unwrap_or_else(|error| panic!("Offset {offset}, size {size}: {error}"));

            // Ensure it is correct
            assert_eq!(data, buffer.as_slice(), "Offset {offset}, size {size}");

            // Update data with random contents and write
            thread_rng().fill(data);
            file.write_all_at(data, offset as u64)
                .unwrap_or_else(|error| panic!("Offset {offset}, size {size}: {error}"));

            // Read contents again
            file.read_exact_at(buffer.as_mut_slice(), offset as u64)
                .unwrap_or_else(|error| panic!("Offset {offset}, size {size}: {error}"));

            // Ensure it is correct too
            assert_eq!(data, buffer.as_slice(), "Offset {offset}, size {size}");
        }
    }
}