summaryrefslogtreecommitdiffstats
path: root/library/std/src/sys/unix/kernel_copy.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
commit698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree173a775858bd501c378080a10dca74132f05bc50 /library/std/src/sys/unix/kernel_copy.rs
parentInitial commit. (diff)
downloadrustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'library/std/src/sys/unix/kernel_copy.rs')
-rw-r--r--library/std/src/sys/unix/kernel_copy.rs686
1 files changed, 686 insertions, 0 deletions
diff --git a/library/std/src/sys/unix/kernel_copy.rs b/library/std/src/sys/unix/kernel_copy.rs
new file mode 100644
index 000000000..8f7abb55e
--- /dev/null
+++ b/library/std/src/sys/unix/kernel_copy.rs
@@ -0,0 +1,686 @@
+//! This module contains specializations that can offload `io::copy()` operations on file descriptor
+//! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
+//!
+//! Specialization is only applied to wholly std-owned types so that user code can't observe
+//! that the `Read` and `Write` traits are not used.
+//!
+//! Since a copy operation involves a reader and writer side where each can consist of different types
+//! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
+//! a single method on all possible combinations.
+//!
+//! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
+//! traits and then specialized on by the `Copier::copy` method.
+//!
+//! `Copier` uses the specialization traits to unpack the underlying file descriptors and
+//! additional prerequisites and constraints imposed by the wrapper types.
+//!
+//! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
+//! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
+//! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
+//! Since those syscalls have requirements that cannot be fully checked in advance and
+//! gathering additional information about file descriptors would require additional syscalls
+//! anyway it simply attempts to use them one after another (guided by inaccurate hints) to
+//! figure out which one works and and falls back to the generic read-write copy loop if none of them
+//! does.
+//! Once a working syscall is found for a pair of file descriptors it will be called in a loop
+//! until the copy operation is completed.
+//!
+//! Advantages of using these syscalls:
+//!
+//! * fewer context switches since reads and writes are coalesced into a single syscall
+//! and more bytes are transferred per syscall. This translates to higher throughput
+//! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
+//! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
+//! consuming less disk space
+//! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
+//! a naive copy loop would move every byte through the CPU.
+//!
+//! Drawbacks:
+//!
+//! * copy operations smaller than the default buffer size can under some circumstances, especially
+//! on older kernels, incur more syscalls than the naive approach would. As mentioned above
+//! the syscall selection is guided by hints to minimize this possibility but they are not perfect.
+//! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
+//! progress, they can hit a performance cliff.
+//! * complexity
+
+use crate::cmp::min;
+use crate::fs::{File, Metadata};
+use crate::io::copy::generic_copy;
+use crate::io::{
+ BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take,
+ Write,
+};
+use crate::mem::ManuallyDrop;
+use crate::net::TcpStream;
+use crate::os::unix::fs::FileTypeExt;
+use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
+use crate::os::unix::net::UnixStream;
+use crate::process::{ChildStderr, ChildStdin, ChildStdout};
+use crate::ptr;
+use crate::sync::atomic::{AtomicBool, AtomicU8, Ordering};
+use crate::sys::cvt;
+use crate::sys::weak::syscall;
+use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV};
+
+#[cfg(test)]
+mod tests;
+
+pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
+ read: &mut R,
+ write: &mut W,
+) -> Result<u64> {
+ let copier = Copier { read, write };
+ SpecCopy::copy(copier)
+}
+
+/// This type represents either the inferred `FileType` of a `RawFd` based on the source
+/// type from which it was extracted or the actual metadata
+///
+/// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
+/// type may be wrong.
+enum FdMeta {
+ /// We obtained the FD from a type that can contain any type of `FileType` and queried the metadata
+ /// because it is cheaper than probing all possible syscalls (reader side)
+ Metadata(Metadata),
+ Socket,
+ Pipe,
+ /// We don't have any metadata, e.g. because the original type was `File` which can represent
+ /// any `FileType` and we did not query the metadata either since it did not seem beneficial
+ /// (writer side)
+ NoneObtained,
+}
+
+impl FdMeta {
+ fn maybe_fifo(&self) -> bool {
+ match self {
+ FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
+ FdMeta::Socket => false,
+ FdMeta::Pipe => true,
+ FdMeta::NoneObtained => true,
+ }
+ }
+
+ fn potential_sendfile_source(&self) -> bool {
+ match self {
+ // procfs erroneously shows 0 length on non-empty readable files.
+ // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
+ // thus there would be benefit from attempting sendfile
+ FdMeta::Metadata(meta)
+ if meta.file_type().is_file() && meta.len() > 0
+ || meta.file_type().is_block_device() =>
+ {
+ true
+ }
+ _ => false,
+ }
+ }
+
+ fn copy_file_range_candidate(&self) -> bool {
+ match self {
+ // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
+ // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
+ FdMeta::Metadata(meta) if meta.is_file() && meta.len() > 0 => true,
+ FdMeta::NoneObtained => true,
+ _ => false,
+ }
+ }
+}
+
+struct CopyParams(FdMeta, Option<RawFd>);
+
+struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
+ read: &'a mut R,
+ write: &'b mut W,
+}
+
+trait SpecCopy {
+ fn copy(self) -> Result<u64>;
+}
+
+impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> {
+ default fn copy(self) -> Result<u64> {
+ generic_copy(self.read, self.write)
+ }
+}
+
+impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
+ fn copy(self) -> Result<u64> {
+ let (reader, writer) = (self.read, self.write);
+ let r_cfg = reader.properties();
+ let w_cfg = writer.properties();
+
+ // before direct operations on file descriptors ensure that all source and sink buffers are empty
+ let mut flush = || -> crate::io::Result<u64> {
+ let bytes = reader.drain_to(writer, u64::MAX)?;
+ // BufWriter buffered bytes have already been accounted for in earlier write() calls
+ writer.flush()?;
+ Ok(bytes)
+ };
+
+ let mut written = 0u64;
+
+ if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
+ (r_cfg, w_cfg)
+ {
+ written += flush()?;
+ let max_write = reader.min_limit();
+
+ if input_meta.copy_file_range_candidate() && output_meta.copy_file_range_candidate() {
+ let result = copy_regular_files(readfd, writefd, max_write);
+ result.update_take(reader);
+
+ match result {
+ CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
+ CopyResult::Error(e, _) => return Err(e),
+ CopyResult::Fallback(bytes) => written += bytes,
+ }
+ }
+
+ // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
+ // to any writable file descriptor. On older kernels the writer side can only be a socket.
+ // So we just try and fallback if needed.
+ // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
+ // fall back to the generic copy loop.
+ if input_meta.potential_sendfile_source() {
+ let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
+ result.update_take(reader);
+
+ match result {
+ CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
+ CopyResult::Error(e, _) => return Err(e),
+ CopyResult::Fallback(bytes) => written += bytes,
+ }
+ }
+
+ if input_meta.maybe_fifo() || output_meta.maybe_fifo() {
+ let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
+ result.update_take(reader);
+
+ match result {
+ CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
+ CopyResult::Error(e, _) => return Err(e),
+ CopyResult::Fallback(0) => { /* use the fallback below */ }
+ CopyResult::Fallback(_) => {
+ unreachable!("splice should not return > 0 bytes on the fallback path")
+ }
+ }
+ }
+ }
+
+ // fallback if none of the more specialized syscalls wants to work with these file descriptors
+ match generic_copy(reader, writer) {
+ Ok(bytes) => Ok(bytes + written),
+ err => err,
+ }
+ }
+}
+
+#[rustc_specialization_trait]
+trait CopyRead: Read {
+ /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
+ /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
+ /// transferred, whichever occurs sooner.
+ /// If nested buffers are present the outer buffers must be drained first.
+ ///
+ /// This is necessary to directly bypass the wrapper types while preserving the data order
+ /// when operating directly on the underlying file descriptors.
+ fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
+ Ok(0)
+ }
+
+ /// Updates `Take` wrappers to remove the number of bytes copied.
+ fn taken(&mut self, _bytes: u64) {}
+
+ /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
+ /// This method does not account for data `BufReader` buffers and would underreport
+ /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
+ /// after draining the buffers via `drain_to`.
+ fn min_limit(&self) -> u64 {
+ u64::MAX
+ }
+
+ /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
+ fn properties(&self) -> CopyParams;
+}
+
+#[rustc_specialization_trait]
+trait CopyWrite: Write {
+ /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
+ fn properties(&self) -> CopyParams;
+}
+
+impl<T> CopyRead for &mut T
+where
+ T: CopyRead,
+{
+ fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
+ (**self).drain_to(writer, limit)
+ }
+
+ fn taken(&mut self, bytes: u64) {
+ (**self).taken(bytes);
+ }
+
+ fn min_limit(&self) -> u64 {
+ (**self).min_limit()
+ }
+
+ fn properties(&self) -> CopyParams {
+ (**self).properties()
+ }
+}
+
+impl<T> CopyWrite for &mut T
+where
+ T: CopyWrite,
+{
+ fn properties(&self) -> CopyParams {
+ (**self).properties()
+ }
+}
+
+impl CopyRead for File {
+ fn properties(&self) -> CopyParams {
+ CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyRead for &File {
+ fn properties(&self) -> CopyParams {
+ CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for File {
+ fn properties(&self) -> CopyParams {
+ CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for &File {
+ fn properties(&self) -> CopyParams {
+ CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyRead for TcpStream {
+ fn properties(&self) -> CopyParams {
+ // avoid the stat syscall since we can be fairly sure it's a socket
+ CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyRead for &TcpStream {
+ fn properties(&self) -> CopyParams {
+ // avoid the stat syscall since we can be fairly sure it's a socket
+ CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for TcpStream {
+ fn properties(&self) -> CopyParams {
+ // avoid the stat syscall since we can be fairly sure it's a socket
+ CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for &TcpStream {
+ fn properties(&self) -> CopyParams {
+ // avoid the stat syscall since we can be fairly sure it's a socket
+ CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyRead for UnixStream {
+ fn properties(&self) -> CopyParams {
+ // avoid the stat syscall since we can be fairly sure it's a socket
+ CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyRead for &UnixStream {
+ fn properties(&self) -> CopyParams {
+ // avoid the stat syscall since we can be fairly sure it's a socket
+ CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for UnixStream {
+ fn properties(&self) -> CopyParams {
+ // avoid the stat syscall since we can be fairly sure it's a socket
+ CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for &UnixStream {
+ fn properties(&self) -> CopyParams {
+ // avoid the stat syscall since we can be fairly sure it's a socket
+ CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for ChildStdin {
+ fn properties(&self) -> CopyParams {
+ CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyRead for ChildStdout {
+ fn properties(&self) -> CopyParams {
+ CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyRead for ChildStderr {
+ fn properties(&self) -> CopyParams {
+ CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyRead for StdinLock<'_> {
+ fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
+ let buf_reader = self.as_mut_buf();
+ let buf = buf_reader.buffer();
+ let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
+ let bytes_drained = buf.len();
+ writer.write_all(buf)?;
+ buf_reader.consume(bytes_drained);
+
+ Ok(bytes_drained as u64)
+ }
+
+ fn properties(&self) -> CopyParams {
+ CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for StdoutLock<'_> {
+ fn properties(&self) -> CopyParams {
+ CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
+ }
+}
+
+impl CopyWrite for StderrLock<'_> {
+ fn properties(&self) -> CopyParams {
+ CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
+ }
+}
+
+impl<T: CopyRead> CopyRead for Take<T> {
+ fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
+ let local_limit = self.limit();
+ let combined_limit = min(outer_limit, local_limit);
+ let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?;
+ // update limit since read() was bypassed
+ self.set_limit(local_limit - bytes_drained);
+
+ Ok(bytes_drained)
+ }
+
+ fn taken(&mut self, bytes: u64) {
+ self.set_limit(self.limit() - bytes);
+ self.get_mut().taken(bytes);
+ }
+
+ fn min_limit(&self) -> u64 {
+ min(Take::limit(self), self.get_ref().min_limit())
+ }
+
+ fn properties(&self) -> CopyParams {
+ self.get_ref().properties()
+ }
+}
+
+impl<T: CopyRead> CopyRead for BufReader<T> {
+ fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
+ let buf = self.buffer();
+ let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
+ let bytes = buf.len();
+ writer.write_all(buf)?;
+ self.consume(bytes);
+
+ let remaining = outer_limit - bytes as u64;
+
+ // in case of nested bufreaders we also need to drain the ones closer to the source
+ let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
+
+ Ok(bytes as u64 + inner_bytes)
+ }
+
+ fn taken(&mut self, bytes: u64) {
+ self.get_mut().taken(bytes);
+ }
+
+ fn min_limit(&self) -> u64 {
+ self.get_ref().min_limit()
+ }
+
+ fn properties(&self) -> CopyParams {
+ self.get_ref().properties()
+ }
+}
+
+impl<T: CopyWrite> CopyWrite for BufWriter<T> {
+ fn properties(&self) -> CopyParams {
+ self.get_ref().properties()
+ }
+}
+
+fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
+ let fd = fd.as_raw_fd();
+ let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
+ match file.metadata() {
+ Ok(meta) => FdMeta::Metadata(meta),
+ Err(_) => FdMeta::NoneObtained,
+ }
+}
+
+pub(super) enum CopyResult {
+ Ended(u64),
+ Error(Error, u64),
+ Fallback(u64),
+}
+
+impl CopyResult {
+ fn update_take(&self, reader: &mut impl CopyRead) {
+ match *self {
+ CopyResult::Fallback(bytes)
+ | CopyResult::Ended(bytes)
+ | CopyResult::Error(_, bytes) => reader.taken(bytes),
+ }
+ }
+}
+
+/// Invalid file descriptor.
+///
+/// Valid file descriptors are guaranteed to be positive numbers (see `open()` manpage)
+/// while negative values are used to indicate errors.
+/// Thus -1 will never be overlap with a valid open file.
+const INVALID_FD: RawFd = -1;
+
+/// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
+/// As the name says, it only works on regular files.
+///
+/// Callers must handle fallback to a generic copy loop.
+/// `Fallback` may indicate non-zero number of bytes already written
+/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
+pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
+ use crate::cmp;
+
+ const NOT_PROBED: u8 = 0;
+ const UNAVAILABLE: u8 = 1;
+ const AVAILABLE: u8 = 2;
+
+ // Kernel prior to 4.5 don't have copy_file_range
+ // We store the availability in a global to avoid unnecessary syscalls
+ static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED);
+
+ syscall! {
+ fn copy_file_range(
+ fd_in: libc::c_int,
+ off_in: *mut libc::loff_t,
+ fd_out: libc::c_int,
+ off_out: *mut libc::loff_t,
+ len: libc::size_t,
+ flags: libc::c_uint
+ ) -> libc::ssize_t
+ }
+
+ match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
+ NOT_PROBED => {
+ // EPERM can indicate seccomp filters or an immutable file.
+ // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported
+ // and some other error (ENOSYS or EPERM) if it's not available
+ let result = unsafe {
+ cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), 1, 0))
+ };
+
+ if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) {
+ HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
+ } else {
+ HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed);
+ return CopyResult::Fallback(0);
+ }
+ }
+ UNAVAILABLE => return CopyResult::Fallback(0),
+ _ => {}
+ };
+
+ let mut written = 0u64;
+ while written < max_len {
+ let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
+ // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
+ // this allows us to copy large chunks without hitting EOVERFLOW,
+ // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
+ let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
+ let copy_result = unsafe {
+ // We actually don't have to adjust the offsets,
+ // because copy_file_range adjusts the file offset automatically
+ cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, 0))
+ };
+
+ match copy_result {
+ Ok(0) if written == 0 => {
+ // fallback to work around several kernel bugs where copy_file_range will fail to
+ // copy any bytes and return 0 instead of an error if
+ // - reading virtual files from the proc filesystem which appear to have 0 size
+ // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
+ // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
+ return CopyResult::Fallback(0);
+ }
+ Ok(0) => return CopyResult::Ended(written), // reached EOF
+ Ok(ret) => written += ret as u64,
+ Err(err) => {
+ return match err.raw_os_error() {
+ // when file offset + max_length > u64::MAX
+ Some(EOVERFLOW) => CopyResult::Fallback(written),
+ Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => {
+ // Try fallback io::copy if either:
+ // - Kernel version is < 4.5 (ENOSYS¹)
+ // - Files are mounted on different fs (EXDEV)
+ // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
+ // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
+ // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
+ // - the writer fd was opened with O_APPEND (EBADF²)
+ // and no bytes were written successfully yet. (All these errnos should
+ // not be returned if something was already written, but they happen in
+ // the wild, see #91152.)
+ //
+ // ¹ these cases should be detected by the initial probe but we handle them here
+ // anyway in case syscall interception changes during runtime
+ // ² actually invalid file descriptors would cause this too, but in that case
+ // the fallback code path is expected to encounter the same error again
+ CopyResult::Fallback(0)
+ }
+ _ => CopyResult::Error(err, written),
+ };
+ }
+ }
+ }
+ CopyResult::Ended(written)
+}
+
+#[derive(PartialEq)]
+enum SpliceMode {
+ Sendfile,
+ Splice,
+}
+
+/// performs splice or sendfile between file descriptors
+/// Does _not_ fall back to a generic copy loop.
+fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
+ static HAS_SENDFILE: AtomicBool = AtomicBool::new(true);
+ static HAS_SPLICE: AtomicBool = AtomicBool::new(true);
+
+ // Android builds use feature level 14, but the libc wrapper for splice is
+ // gated on feature level 21+, so we have to invoke the syscall directly.
+ #[cfg(target_os = "android")]
+ syscall! {
+ fn splice(
+ srcfd: libc::c_int,
+ src_offset: *const i64,
+ dstfd: libc::c_int,
+ dst_offset: *const i64,
+ len: libc::size_t,
+ flags: libc::c_int
+ ) -> libc::ssize_t
+ }
+
+ #[cfg(target_os = "linux")]
+ use libc::splice;
+
+ match mode {
+ SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
+ return CopyResult::Fallback(0);
+ }
+ SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
+ return CopyResult::Fallback(0);
+ }
+ _ => (),
+ }
+
+ let mut written = 0u64;
+ while written < len {
+ // according to its manpage that's the maximum size sendfile() will copy per invocation
+ let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;
+
+ let result = match mode {
+ SpliceMode::Sendfile => {
+ cvt(unsafe { libc::sendfile(writer, reader, ptr::null_mut(), chunk_size) })
+ }
+ SpliceMode::Splice => cvt(unsafe {
+ splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
+ }),
+ };
+
+ match result {
+ Ok(0) => break, // EOF
+ Ok(ret) => written += ret as u64,
+ Err(err) => {
+ return match err.raw_os_error() {
+ Some(ENOSYS | EPERM) => {
+ // syscall not supported (ENOSYS)
+ // syscall is disallowed, e.g. by seccomp (EPERM)
+ match mode {
+ SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
+ SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
+ }
+ assert_eq!(written, 0);
+ CopyResult::Fallback(0)
+ }
+ Some(EINVAL) => {
+ // splice/sendfile do not support this particular file descriptor (EINVAL)
+ assert_eq!(written, 0);
+ CopyResult::Fallback(0)
+ }
+ Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
+ CopyResult::Fallback(written)
+ }
+ _ => CopyResult::Error(err, written),
+ };
+ }
+ }
+ }
+ CopyResult::Ended(written)
+}