From 054ca4a6b517cd86893a92362f152b7cde0071a1 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sun, 26 Nov 2023 17:01:22 -0500 Subject: [PATCH 01/11] wc: better handle files in pseudo-filesystems --- src/uu/wc/src/count_fast.rs | 26 +++++++++++++++++++++----- tests/by-util/test_wc.rs | 8 ++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 863625921..3872e60bf 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -2,6 +2,8 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. + +// cSpell:ignore sysconf use crate::word_count::WordCount; use super::WordCountable; @@ -11,7 +13,7 @@ use std::fs::OpenOptions; use std::io::{self, ErrorKind, Read}; #[cfg(unix)] -use libc::S_IFREG; +use libc::{sysconf, S_IFREG, _SC_PAGESIZE}; #[cfg(unix)] use nix::sys::stat; #[cfg(any(target_os = "linux", target_os = "android"))] @@ -87,11 +89,25 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // If stat.st_size = 0 then // - either the size is 0 // - or the size is unknown. - // The second case happens for files in pseudo-filesystems. For - // example with /proc/version and /sys/kernel/profiling. So, - // if it is 0 we don't report that and instead do a full read. + // The second case happens for files in pseudo-filesystems. + // For example with /proc/version. + // So, if it is 0 we don't report that and instead do a full read. + // + // Another thing to consider for files in pseudo-filesystems like /proc, /sys + // and similar is that they could report `st_size` greater than actual content. + // For example /sys/kernel/profiling could report `st_size` equal to + // system page size (typically 4096 on 64bit system), while it's file content + // would count up only to a couple of bytes. + // This condition usually occurs for files in pseudo-filesystems like /proc, /sys + // that report `st_size` in the multiples of system page size. + // In such cases - fall back on full read if (stat.st_mode as libc::mode_t & S_IFREG) != 0 && stat.st_size > 0 { - return (stat.st_size as usize, None); + let sys_page_size = unsafe { sysconf(_SC_PAGESIZE) as usize }; + if stat.st_size as usize % sys_page_size > 0 { + // regular file or file from /proc, /sys and similar pseudo-filesystems + // with size that is NOT a multiple of system page size + return (stat.st_size as usize, None); + } } #[cfg(any(target_os = "linux", target_os = "android"))] { diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs index 8358a542a..c365b13b9 100644 --- a/tests/by-util/test_wc.rs +++ b/tests/by-util/test_wc.rs @@ -419,6 +419,14 @@ fn test_files_from_pseudo_filesystem() { use pretty_assertions::assert_ne; let result = new_ucmd!().arg("-c").arg("/proc/cpuinfo").succeeds(); assert_ne!(result.stdout_str(), "0 /proc/cpuinfo\n"); + + let (at, mut ucmd) = at_and_ucmd!(); + let result = ucmd.arg("-c").arg("/sys/kernel/profiling").succeeds(); + let actual = at.read("/sys/kernel/profiling").len(); + assert_eq!( + result.stdout_str(), + format!("{} /sys/kernel/profiling\n", actual) + ); } #[test] From 253926f2e2ae39613db744d3a5f0f15671c93b8a Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Fri, 1 Dec 2023 16:20:25 -0500 Subject: [PATCH 02/11] wc: unix input redirect --- src/uu/wc/src/count_fast.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 3872e60bf..5fa1a978f 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -101,7 +101,26 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // This condition usually occurs for files in pseudo-filesystems like /proc, /sys // that report `st_size` in the multiples of system page size. // In such cases - fall back on full read - if (stat.st_mode as libc::mode_t & S_IFREG) != 0 && stat.st_size > 0 { + // + // And finally a special case of input redirection in *nix shell: + // `( wc -c ; wc -c ) < file` should return + // ``` + // size_of_file + // 0 + // ``` + // Similarly + // `( head -c1 ; wc -c ) < file` should return + // ``` + // first_byte_of_file + // size_of_file - 1 + // ``` + // Since the input stream from file is treated as continuous across both commands inside (). + // In cases like this, due to `<` redirect, the `stat.st_mode` would report input as a regular file + // and `stat.st_size` would report the size of file on disk + // and NOT the remaining number of bytes in the input stream. The raw file descriptor + // in this situation would be equal to `0` for STDIN in both invocations. + // Therefore we cannot rely of `st_size` here and should fall back on full read. + if fd > 0 && (stat.st_mode as libc::mode_t & S_IFREG) != 0 && stat.st_size > 0 { let sys_page_size = unsafe { sysconf(_SC_PAGESIZE) as usize }; if stat.st_size as usize % sys_page_size > 0 { // regular file or file from /proc, /sys and similar pseudo-filesystems From 6186153a0856eae9b631a82c3b9b05666405b953 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Fri, 1 Dec 2023 19:19:42 -0500 Subject: [PATCH 03/11] wc: count_fast optimization using seek --- src/uu/wc/src/count_fast.rs | 8 ++++++-- src/uu/wc/src/countable.rs | 9 +++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 5fa1a978f..1eb1d9127 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -10,7 +10,7 @@ use super::WordCountable; #[cfg(any(target_os = "linux", target_os = "android"))] use std::fs::OpenOptions; -use std::io::{self, ErrorKind, Read}; +use std::io::{self, ErrorKind, Read, Seek, SeekFrom}; #[cfg(unix)] use libc::{sysconf, S_IFREG, _SC_PAGESIZE}; @@ -100,7 +100,7 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // would count up only to a couple of bytes. // This condition usually occurs for files in pseudo-filesystems like /proc, /sys // that report `st_size` in the multiples of system page size. - // In such cases - fall back on full read + // In such cases - attempt `seek()` for the end of file // // And finally a special case of input redirection in *nix shell: // `( wc -c ; wc -c ) < file` should return @@ -126,6 +126,10 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // regular file or file from /proc, /sys and similar pseudo-filesystems // with size that is NOT a multiple of system page size return (stat.st_size as usize, None); + } else if let Some(file) = handle.inner_file() { + if let Ok(n) = file.seek(SeekFrom::End(0)) { + return (n as usize, None); + } } } #[cfg(any(target_os = "linux", target_os = "android"))] diff --git a/src/uu/wc/src/countable.rs b/src/uu/wc/src/countable.rs index 643974464..d27c7fb59 100644 --- a/src/uu/wc/src/countable.rs +++ b/src/uu/wc/src/countable.rs @@ -17,12 +17,14 @@ use std::os::unix::io::AsRawFd; pub trait WordCountable: AsRawFd + Read { type Buffered: BufRead; fn buffered(self) -> Self::Buffered; + fn inner_file(&mut self) -> Option<&mut File>; } #[cfg(not(unix))] pub trait WordCountable: Read { type Buffered: BufRead; fn buffered(self) -> Self::Buffered; + fn inner_file(&mut self) -> Option<&mut File>; } impl WordCountable for StdinLock<'_> { @@ -31,6 +33,9 @@ impl WordCountable for StdinLock<'_> { fn buffered(self) -> Self::Buffered { self } + fn inner_file(&mut self) -> Option<&mut File> { + None + } } impl WordCountable for File { @@ -39,4 +44,8 @@ impl WordCountable for File { fn buffered(self) -> Self::Buffered { BufReader::new(self) } + + fn inner_file(&mut self) -> Option<&mut File> { + Some(self) + } } From 85e78376fe932e168b6a891b829d55394330014b Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Fri, 1 Dec 2023 20:18:15 -0500 Subject: [PATCH 04/11] wc: count_fast seek optimization --- src/uu/wc/src/count_fast.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 1eb1d9127..1d3102f99 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -100,7 +100,8 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // would count up only to a couple of bytes. // This condition usually occurs for files in pseudo-filesystems like /proc, /sys // that report `st_size` in the multiples of system page size. - // In such cases - attempt `seek()` for the end of file + // In such cases - attempt `seek()` almost to the end of the file + // and then fall back on read to count the rest. // // And finally a special case of input redirection in *nix shell: // `( wc -c ; wc -c ) < file` should return @@ -127,8 +128,9 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // with size that is NOT a multiple of system page size return (stat.st_size as usize, None); } else if let Some(file) = handle.inner_file() { - if let Ok(n) = file.seek(SeekFrom::End(0)) { - return (n as usize, None); + let offset = stat.st_size - stat.st_size % (stat.st_blksize as i64 + 1); + if let Ok(n) = file.seek(SeekFrom::Start(offset as u64)) { + byte_count = n as usize; } } } From 9c4d88009d1990fc52126eee43155310a104c9c3 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Fri, 1 Dec 2023 20:49:49 -0500 Subject: [PATCH 05/11] wc: clippy --- src/uu/wc/src/count_fast.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 1d3102f99..0d9396740 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -10,12 +10,14 @@ use super::WordCountable; #[cfg(any(target_os = "linux", target_os = "android"))] use std::fs::OpenOptions; -use std::io::{self, ErrorKind, Read, Seek, SeekFrom}; +use std::io::{self, ErrorKind, Read}; #[cfg(unix)] use libc::{sysconf, S_IFREG, _SC_PAGESIZE}; #[cfg(unix)] use nix::sys::stat; +#[cfg(unix)] +use std::io::{Seek, SeekFrom}; #[cfg(any(target_os = "linux", target_os = "android"))] use std::os::unix::io::AsRawFd; @@ -128,7 +130,12 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // with size that is NOT a multiple of system page size return (stat.st_size as usize, None); } else if let Some(file) = handle.inner_file() { + // On some platforms `stat.st_blksize` is of i32 type, + // i.e. MacOS on Apple Silicon (aarch64-apple-darwin), + // as well as Debian Linux on ARM (aarch64-unknown-linux-gnu) + #[allow(clippy::unnecessary_cast)] let offset = stat.st_size - stat.st_size % (stat.st_blksize as i64 + 1); + if let Ok(n) = file.seek(SeekFrom::Start(offset as u64)) { byte_count = n as usize; } From 9ff7b42d832fdc21ef20b02608e9623a4be50f8d Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Fri, 1 Dec 2023 21:00:39 -0500 Subject: [PATCH 06/11] wc: stat casting --- src/uu/wc/src/count_fast.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 0d9396740..09b48b21e 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -130,11 +130,12 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // with size that is NOT a multiple of system page size return (stat.st_size as usize, None); } else if let Some(file) = handle.inner_file() { - // On some platforms `stat.st_blksize` is of i32 type, + // On some platforms `stat.st_blksize` and/or `st.st_size` is of i32 type, // i.e. MacOS on Apple Silicon (aarch64-apple-darwin), - // as well as Debian Linux on ARM (aarch64-unknown-linux-gnu) + // as well as Debian Linux on ARM (aarch64-unknown-linux-gnu), etc. #[allow(clippy::unnecessary_cast)] - let offset = stat.st_size - stat.st_size % (stat.st_blksize as i64 + 1); + let offset = + stat.st_size as i64 - stat.st_size as i64 % (stat.st_blksize as i64 + 1); if let Ok(n) = file.seek(SeekFrom::Start(offset as u64)) { byte_count = n as usize; From 54ac5a7e1ac804bc57496faf31877da82d5b4be4 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 2 Dec 2023 16:06:19 -0500 Subject: [PATCH 07/11] wc: count_fast windows optimization --- src/uu/wc/src/count_fast.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 09b48b21e..59bde31c3 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -20,6 +20,12 @@ use nix::sys::stat; use std::io::{Seek, SeekFrom}; #[cfg(any(target_os = "linux", target_os = "android"))] use std::os::unix::io::AsRawFd; +#[cfg(windows)] +use std::os::windows::fs::MetadataExt; +#[cfg(windows)] +const FILE_ATTRIBUTE_ARCHIVE: u32 = 32; +#[cfg(windows)] +const FILE_ATTRIBUTE_NORMAL: u32 = 128; #[cfg(any(target_os = "linux", target_os = "android"))] use libc::S_IFIFO; @@ -76,6 +82,8 @@ fn count_bytes_using_splice(fd: &impl AsRawFd) -> Result { /// 1. On Unix, we can simply `stat` the file if it is regular. /// 2. On Linux -- if the above did not work -- we can use splice to count /// the number of bytes if the file is a FIFO. +/// 3. On Windows we can use `std::os::windows::fs::MetadataExt` to get file size +/// for regular files /// 3. Otherwise, we just read normally, but without the overhead of counting /// other things such as lines and words. #[inline] @@ -130,9 +138,12 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // with size that is NOT a multiple of system page size return (stat.st_size as usize, None); } else if let Some(file) = handle.inner_file() { - // On some platforms `stat.st_blksize` and/or `st.st_size` is of i32 type, + // On some platforms `stat.st_blksize` and `st.st_size` + // are of different types: i64 vs i32 // i.e. MacOS on Apple Silicon (aarch64-apple-darwin), - // as well as Debian Linux on ARM (aarch64-unknown-linux-gnu), etc. + // Debian Linux on ARM (aarch64-unknown-linux-gnu), + // 32bit i686 targets, etc. + // While on the others they are of the same type. #[allow(clippy::unnecessary_cast)] let offset = stat.st_size as i64 - stat.st_size as i64 % (stat.st_blksize as i64 + 1); @@ -156,6 +167,22 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti } } + #[cfg(windows)] + { + if let Some(file) = handle.inner_file() { + if let Ok(metadata) = file.metadata() { + let attributes = metadata.file_attributes(); + let size = metadata.file_size(); + + if (attributes & FILE_ATTRIBUTE_ARCHIVE) != 0 + || (attributes & FILE_ATTRIBUTE_NORMAL) != 0 + { + return (size as usize, None); + } + } + } + } + // Fall back on `read`, but without the overhead of counting words and lines. let mut buf = [0_u8; BUF_SIZE]; loop { From b7f708b23326d4914b84b0e363cdff35a3ab4507 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 2 Dec 2023 16:50:30 -0500 Subject: [PATCH 08/11] wc: comments --- src/uu/wc/src/count_fast.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 59bde31c3..d7875933d 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -128,8 +128,9 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // Since the input stream from file is treated as continuous across both commands inside (). // In cases like this, due to `<` redirect, the `stat.st_mode` would report input as a regular file // and `stat.st_size` would report the size of file on disk - // and NOT the remaining number of bytes in the input stream. The raw file descriptor - // in this situation would be equal to `0` for STDIN in both invocations. + // and NOT the remaining number of bytes in the input stream. + // However, the raw file descriptor in this situation would be equal to `0` + // for STDIN in both invocations. // Therefore we cannot rely of `st_size` here and should fall back on full read. if fd > 0 && (stat.st_mode as libc::mode_t & S_IFREG) != 0 && stat.st_size > 0 { let sys_page_size = unsafe { sysconf(_SC_PAGESIZE) as usize }; @@ -138,7 +139,7 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // with size that is NOT a multiple of system page size return (stat.st_size as usize, None); } else if let Some(file) = handle.inner_file() { - // On some platforms `stat.st_blksize` and `st.st_size` + // On some platforms `stat.st_blksize` and `stat.st_size` // are of different types: i64 vs i32 // i.e. MacOS on Apple Silicon (aarch64-apple-darwin), // Debian Linux on ARM (aarch64-unknown-linux-gnu), From a97b574fec16ad40d5f3ffc97b9e256d83c36718 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 2 Dec 2023 16:52:04 -0500 Subject: [PATCH 09/11] wc: comments --- src/uu/wc/src/count_fast.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index d7875933d..487893b9f 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -128,8 +128,8 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti // Since the input stream from file is treated as continuous across both commands inside (). // In cases like this, due to `<` redirect, the `stat.st_mode` would report input as a regular file // and `stat.st_size` would report the size of file on disk - // and NOT the remaining number of bytes in the input stream. - // However, the raw file descriptor in this situation would be equal to `0` + // and NOT the remaining number of bytes in the input stream. + // However, the raw file descriptor in this situation would be equal to `0` // for STDIN in both invocations. // Therefore we cannot rely of `st_size` here and should fall back on full read. if fd > 0 && (stat.st_mode as libc::mode_t & S_IFREG) != 0 && stat.st_size > 0 { From 967c539cc6ccd08fda8b6d1de0d49f45b4311f0d Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sun, 3 Dec 2023 16:00:18 -0500 Subject: [PATCH 10/11] wc: more tests --- tests/by-util/test_wc.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs index c365b13b9..9c55bd3bc 100644 --- a/tests/by-util/test_wc.rs +++ b/tests/by-util/test_wc.rs @@ -243,6 +243,14 @@ fn test_single_only_lines() { .stdout_is("18 moby_dick.txt\n"); } +#[test] +fn test_single_only_bytes() { + new_ucmd!() + .args(&["-c", "lorem_ipsum.txt"]) + .run() + .stdout_is("772 lorem_ipsum.txt\n"); +} + #[test] fn test_single_all_counts() { new_ucmd!() From 0076c9f64c37aad2033867487462beb342cf1e13 Mon Sep 17 00:00:00 2001 From: Yury Zhytkou <54360928+zhitkoff@users.noreply.github.com> Date: Thu, 7 Dec 2023 12:13:34 -0500 Subject: [PATCH 11/11] Update src/uu/wc/src/count_fast.rs Co-authored-by: Sylvestre Ledru --- src/uu/wc/src/count_fast.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 487893b9f..85d3d1d2b 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -173,12 +173,11 @@ pub(crate) fn count_bytes_fast(handle: &mut T) -> (usize, Opti if let Some(file) = handle.inner_file() { if let Ok(metadata) = file.metadata() { let attributes = metadata.file_attributes(); - let size = metadata.file_size(); if (attributes & FILE_ATTRIBUTE_ARCHIVE) != 0 || (attributes & FILE_ATTRIBUTE_NORMAL) != 0 { - return (size as usize, None); + return (metadata.file_size() as usize, None); } } }