shuf: handle --input-range with huge number ranges

This requires special handling, because we cannot always generate all possible strings beforehand, e.g. in the case of "-n 2 -i 0-2147483647".
2025-09-14 11:07:59 +00:00 · 2024-02-16 20:48:12 +01:00 · 2024-02-16 20:48:12 +01:00 · 352a8a5bd7
commit 352a8a5bd7
parent 989178f438
2 changed files with 293 additions and 5 deletions
--- a/src/uu/shuf/src/shuf.rs
+++ b/src/uu/shuf/src/shuf.rs
@ -3,12 +3,13 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
-// spell-checker:ignore (ToDO) cmdline evec seps shufable rvec fdata
+// spell-checker:ignore (ToDO) cmdline evec nonrepeating seps shufable rvec fdata
 use clap::{crate_version, Arg, ArgAction, Command};
 use memchr::memchr_iter;
 use rand::prelude::SliceRandom;
-use rand::RngCore;
+use rand::{Rng, RngCore};
 use std::collections::HashSet;
 use std::fs::File;
 use std::io::{stdin, stdout, BufReader, BufWriter, Error, Read, Write};
 use uucore::display::Quotable;
@ -119,9 +120,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
            shuf_exec(&mut evec, options)?;
        }
        Mode::InputRange((b, e)) => {
-            let rvec = (b..e).map(|x| format!("{x}")).collect::<Vec<String>>();
+            shuf_exec(&mut (b, e), options)?;
            let mut rvec = rvec.iter().map(String::as_bytes).collect::<Vec<&[u8]>>();
            shuf_exec(&mut rvec, options)?;
        }
        Mode::Default(filename) => {
            let fdata = read_input_file(&filename)?;
@ -290,6 +289,117 @@ impl<'a> Shufable for Vec<&'a [u8]> {
    }
 }
 impl Shufable for (usize, usize) {
    type Item = usize;
    fn is_empty(&self) -> bool {
        // Note: This is an inclusive range, so equality means there is 1 element.
        self.0 > self.1
    }
    fn choose(&self, rng: &mut WrappedRng) -> usize {
        rng.gen_range(self.0..self.1)
    }
    type PartialShuffleIterator<'b> = NonrepeatingIterator<'b> where Self: 'b;
    fn partial_shuffle<'b>(
        &'b mut self,
        rng: &'b mut WrappedRng,
        amount: usize,
    ) -> Self::PartialShuffleIterator<'b> {
        NonrepeatingIterator::new(self.0, self.1, rng, amount)
    }
 }
 enum NumberSet {
    AlreadyListed(HashSet<usize>),
    Remaining(Vec<usize>),
 }
 struct NonrepeatingIterator<'a> {
    begin: usize,
    end: usize, // exclusive
    rng: &'a mut WrappedRng,
    remaining_count: usize,
    buf: NumberSet,
 }
 impl<'a> NonrepeatingIterator<'a> {
    fn new(
        begin: usize,
        end: usize,
        rng: &'a mut WrappedRng,
        amount: usize,
    ) -> NonrepeatingIterator {
        let capped_amount = if begin > end {
            0
        } else {
            amount.min(end - begin)
        };
        NonrepeatingIterator {
            begin,
            end,
            rng,
            remaining_count: capped_amount,
            buf: NumberSet::AlreadyListed(HashSet::default()),
        }
    }
    fn produce(&mut self) -> usize {
        debug_assert!(self.begin <= self.end);
        match &mut self.buf {
            NumberSet::AlreadyListed(already_listed) => {
                let chosen = loop {
                    let guess = self.rng.gen_range(self.begin..self.end);
                    let newly_inserted = already_listed.insert(guess);
                    if newly_inserted {
                        break guess;
                    }
                };
                // Once a significant fraction of the interval has already been enumerated,
                // the number of attempts to find a number that hasn't been chosen yet increases.
                // Therefore, we need to switch at some point from "set of already returned values" to "list of remaining values".
                let range_size = self.end - self.begin;
                if number_set_should_list_remaining(already_listed.len(), range_size) {
                    let mut remaining = (self.begin..self.end)
                        .filter(|n| !already_listed.contains(n))
                        .collect::<Vec<_>>();
                    assert!(remaining.len() >= self.remaining_count);
                    remaining.partial_shuffle(&mut self.rng, self.remaining_count);
                    remaining.truncate(self.remaining_count);
                    self.buf = NumberSet::Remaining(remaining);
                }
                chosen
            }
            NumberSet::Remaining(remaining_numbers) => {
                debug_assert!(!remaining_numbers.is_empty());
                // We only enter produce() when there is at least one actual element remaining, so popping must always return an element.
                remaining_numbers.pop().unwrap()
            }
        }
    }
 }
 impl<'a> Iterator for NonrepeatingIterator<'a> {
    type Item = usize;
    fn next(&mut self) -> Option<usize> {
        if self.begin > self.end || self.remaining_count == 0 {
            return None;
        }
        self.remaining_count -= 1;
        Some(self.produce())
    }
 }
 // This could be a method, but it is much easier to test as a stand-alone function.
 fn number_set_should_list_remaining(listed_count: usize, range_size: usize) -> bool {
    // Arbitrarily determine the switchover point to be around 25%. This is because:
    // - HashSet has a large space overhead for the hash table load factor.
    // - This means that somewhere between 25-40%, the memory required for a "positive" HashSet and a "negative" Vec should be the same.
    // - HashSet has a small but non-negligible overhead for each lookup, so we have a slight preference for Vec anyway.
    // - At 25%, on average 1.33 attempts are needed to find a number that hasn't been taken yet.
    // - Finally, "24%" is computationally the simplest:
    listed_count >= range_size / 4
 }
 trait Writable {
    fn write_all_to(&self, output: &mut impl Write) -> Result<(), Error>;
 }
@ -412,3 +522,88 @@ impl RngCore for WrappedRng {
        }
    }
 }
 #[cfg(test)]
 // Since the computed value is a bool, it is more readable to write the expected value out:
 #[allow(clippy::bool_assert_comparison)]
 mod test_number_set_decision {
    use super::number_set_should_list_remaining;
    #[test]
    fn test_stay_positive_large_remaining_first() {
        assert_eq!(false, number_set_should_list_remaining(0, std::usize::MAX));
    }
    #[test]
    fn test_stay_positive_large_remaining_second() {
        assert_eq!(false, number_set_should_list_remaining(1, std::usize::MAX));
    }
    #[test]
    fn test_stay_positive_large_remaining_tenth() {
        assert_eq!(false, number_set_should_list_remaining(9, std::usize::MAX));
    }
    #[test]
    fn test_stay_positive_smallish_range_first() {
        assert_eq!(false, number_set_should_list_remaining(0, 12345));
    }
    #[test]
    fn test_stay_positive_smallish_range_second() {
        assert_eq!(false, number_set_should_list_remaining(1, 12345));
    }
    #[test]
    fn test_stay_positive_smallish_range_tenth() {
        assert_eq!(false, number_set_should_list_remaining(9, 12345));
    }
    #[test]
    fn test_stay_positive_small_range_not_too_early() {
        assert_eq!(false, number_set_should_list_remaining(1, 10));
    }
    // Don't want to test close to the border, in case we decide to change the threshold.
    // However, at 50% coverage, we absolutely should switch:
    #[test]
    fn test_switch_half() {
        assert_eq!(true, number_set_should_list_remaining(1234, 2468));
    }
    // Ensure that the decision is monotonous:
    #[test]
    fn test_switch_late1() {
        assert_eq!(true, number_set_should_list_remaining(12340, 12345));
    }
    #[test]
    fn test_switch_late2() {
        assert_eq!(true, number_set_should_list_remaining(12344, 12345));
    }
    // Ensure that we are overflow-free:
    #[test]
    fn test_no_crash_exceed_max_size1() {
        assert_eq!(
            false,
            number_set_should_list_remaining(12345, std::usize::MAX)
        );
    }
    #[test]
    fn test_no_crash_exceed_max_size2() {
        assert_eq!(
            true,
            number_set_should_list_remaining(std::usize::MAX - 1, std::usize::MAX)
        );
    }
    #[test]
    fn test_no_crash_exceed_max_size3() {
        assert_eq!(
            true,
            number_set_should_list_remaining(std::usize::MAX, std::usize::MAX)
        );
    }
 }
--- a/tests/by-util/test_shuf.rs
+++ b/tests/by-util/test_shuf.rs
@ -88,6 +88,99 @@ fn test_zero_termination_multi() {
    assert_eq!(result_seq, input_seq, "Output is not a permutation");
 }
 #[test]
 fn test_very_large_range() {
    let num_samples = 10;
    let result = new_ucmd!()
        .arg("-n")
        .arg(&num_samples.to_string())
        .arg("-i0-1234567890")
        .succeeds();
    result.no_stderr();
    let result_seq: Vec<isize> = result
        .stdout_str()
        .split('\n')
        .filter(|x| !x.is_empty())
        .map(|x| x.parse().unwrap())
        .collect();
    assert_eq!(result_seq.len(), num_samples, "Miscounted output length!");
    assert!(
        result_seq.iter().all(|x| (0..=1234567890).contains(x)),
        "Output includes element not from range: {}",
        result.stdout_str()
    );
 }
 #[test]
 fn test_very_large_range_offset() {
    let num_samples = 10;
    let result = new_ucmd!()
        .arg("-n")
        .arg(&num_samples.to_string())
        .arg("-i1234567890-2147483647")
        .succeeds();
    result.no_stderr();
    let result_seq: Vec<isize> = result
        .stdout_str()
        .split('\n')
        .filter(|x| !x.is_empty())
        .map(|x| x.parse().unwrap())
        .collect();
    assert_eq!(result_seq.len(), num_samples, "Miscounted output length!");
    assert!(
        result_seq
            .iter()
            .all(|x| (1234567890..=2147483647).contains(x)),
        "Output includes element not from range: {}",
        result.stdout_str()
    );
 }
 #[test]
 fn test_very_high_range_full() {
    let input_seq = vec![
        2147483641, 2147483642, 2147483643, 2147483644, 2147483645, 2147483646, 2147483647,
    ];
    let result = new_ucmd!().arg("-i2147483641-2147483647").succeeds();
    result.no_stderr();
    let mut result_seq: Vec<isize> = result
        .stdout_str()
        .split('\n')
        .filter(|x| !x.is_empty())
        .map(|x| x.parse().unwrap())
        .collect();
    result_seq.sort_unstable();
    assert_eq!(result_seq, input_seq, "Output is not a permutation");
 }
 #[test]
 fn test_range_repeat() {
    let num_samples = 500;
    let result = new_ucmd!()
        .arg("-r")
        .arg("-n")
        .arg(&num_samples.to_string())
        .arg("-i12-34")
        .succeeds();
    result.no_stderr();
    let result_seq: Vec<isize> = result
        .stdout_str()
        .split('\n')
        .filter(|x| !x.is_empty())
        .map(|x| x.parse().unwrap())
        .collect();
    assert_eq!(result_seq.len(), num_samples, "Miscounted output length!");
    assert!(
        result_seq.iter().all(|x| (12..=34).contains(x)),
        "Output includes element not from range: {}",
        result.stdout_str()
    );
 }
 #[test]
 fn test_empty_input() {
    let result = new_ucmd!().pipe_in(vec![]).succeeds();