mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
Merge pull request #5968 from anastygnome/tsort
tsort refactoring proposal
This commit is contained in:
commit
9a6f5521b9
2 changed files with 188 additions and 70 deletions
64
src/uu/tsort/BENCHMARKING.md
Normal file
64
src/uu/tsort/BENCHMARKING.md
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
# Benchmarking `tsort`
|
||||||
|
<!-- spell-checker:ignore (words) randint tsort DAG uu_tsort GNU -->
|
||||||
|
Much of what makes `tsort` fast is the efficiency of its algorithm and implementation for topological sorting.
|
||||||
|
Our implementation of `tsort` also outputs a cycle whenever such ordering does not exist, just like GNU `tsort`.
|
||||||
|
|
||||||
|
## Strategies
|
||||||
|
|
||||||
|
To test `tsort`'s performance for its nominal use case, we need to test it with a DAG. One of the worst cases is when all nodes are just representing a succession of independent steps.
|
||||||
|
We should also test cycle detection for good measure.
|
||||||
|
|
||||||
|
### Random acyclic graph (DAG)
|
||||||
|
|
||||||
|
This will output a DAG composed of 1 million pairs of edges between nodes numbered from 0 to 10,000, ensuring that the graph is acyclic by always assigning the edge with the smallest id to the node with the highest one.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import random
|
||||||
|
|
||||||
|
N = 10000
|
||||||
|
|
||||||
|
for i in range(100*N):
|
||||||
|
a = random.randint(0, N)
|
||||||
|
b = random.randint(0, N)
|
||||||
|
print(f"{min(a, b)} {max(a, b)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Random graph with cycles
|
||||||
|
|
||||||
|
The following will output a graph with multiples edges, it also allows some degree of tuning to test different cases.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Parameters for the graph
|
||||||
|
num_nodes = 100
|
||||||
|
num_edges = 150
|
||||||
|
cycle_percentage = 0.10
|
||||||
|
max_cycle_size = 6
|
||||||
|
|
||||||
|
num_cycles = int(num_edges * cycle_percentage)
|
||||||
|
|
||||||
|
for _ in range(num_edges - num_cycles):
|
||||||
|
a = random.randint(0, num_nodes)
|
||||||
|
b = random.randint(0, num_nodes)
|
||||||
|
print(f"{a} {b}")
|
||||||
|
|
||||||
|
|
||||||
|
for _ in range(num_cycles):
|
||||||
|
cycle_size = random.randint(3, max_cycle_size)
|
||||||
|
cycle_nodes = random.sample(range(num_nodes), cycle_size)
|
||||||
|
for i in range(cycle_size):
|
||||||
|
print(f"{cycle_nodes[i]} {cycle_nodes[(i + 1) % cycle_size]}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running Benchmarks
|
||||||
|
The above scripts will output the generated graphs to the standard output. They can therefore be used directly as tests. In order to run a Benchmark, the output should be redirected to a file.
|
||||||
|
Use [`hyperfine`](https://github.com/sharkdp/hyperfine) to compare the performance of different `tsort` versions. For example, you can compare the performance of GNU `tsort` and another implementation with the following command:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
hyperfine 'tsort random_graph.txt' 'uu_tsort random_graph.txt'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Note
|
||||||
|
|
||||||
|
Benchmark results from the above scripts are fuzzy and change from run to run unless a seed is set.
|
|
@ -2,8 +2,10 @@
|
||||||
//
|
//
|
||||||
// For the full copyright and license information, please view the LICENSE
|
// For the full copyright and license information, please view the LICENSE
|
||||||
// file that was distributed with this source code.
|
// file that was distributed with this source code.
|
||||||
|
//spell-checker:ignore TAOCP
|
||||||
use clap::{crate_version, Arg, Command};
|
use clap::{crate_version, Arg, Command};
|
||||||
use std::collections::{BTreeMap, BTreeSet};
|
use std::collections::{HashMap, HashSet, VecDeque};
|
||||||
|
use std::fmt::Write;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{stdin, BufReader, Read};
|
use std::io::{stdin, BufReader, Read};
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
@ -45,7 +47,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||||
|
|
||||||
let mut input_buffer = String::new();
|
let mut input_buffer = String::new();
|
||||||
reader.read_to_string(&mut input_buffer)?;
|
reader.read_to_string(&mut input_buffer)?;
|
||||||
let mut g = Graph::new();
|
let mut g = Graph::default();
|
||||||
|
|
||||||
for line in input_buffer.lines() {
|
for line in input_buffer.lines() {
|
||||||
let tokens: Vec<_> = line.split_whitespace().collect();
|
let tokens: Vec<_> = line.split_whitespace().collect();
|
||||||
|
@ -68,22 +70,26 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
g.run_tsort();
|
match g.run_tsort() {
|
||||||
|
Err(cycle) => {
|
||||||
if !g.is_acyclic() {
|
let mut error_message = format!(
|
||||||
return Err(USimpleError::new(
|
"{}: {}: input contains a loop:\n",
|
||||||
1,
|
uucore::util_name(),
|
||||||
format!("{input}, input contains a loop:"),
|
input
|
||||||
));
|
);
|
||||||
|
for node in &cycle {
|
||||||
|
writeln!(error_message, "{}: {}", uucore::util_name(), node).unwrap();
|
||||||
|
}
|
||||||
|
eprint!("{}", error_message);
|
||||||
|
println!("{}", cycle.join("\n"));
|
||||||
|
Err(USimpleError::new(1, ""))
|
||||||
|
}
|
||||||
|
Ok(ordering) => {
|
||||||
|
println!("{}", ordering.join("\n"));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for x in &g.result {
|
|
||||||
println!("{x}");
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn uu_app() -> Command {
|
pub fn uu_app() -> Command {
|
||||||
Command::new(uucore::util_name())
|
Command::new(uucore::util_name())
|
||||||
.version(crate_version!())
|
.version(crate_version!())
|
||||||
|
@ -100,77 +106,125 @@ pub fn uu_app() -> Command {
|
||||||
|
|
||||||
// We use String as a representation of node here
|
// We use String as a representation of node here
|
||||||
// but using integer may improve performance.
|
// but using integer may improve performance.
|
||||||
|
|
||||||
|
struct Node<'input> {
|
||||||
|
successor_names: Vec<&'input str>,
|
||||||
|
predecessor_count: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'input> Node<'input> {
|
||||||
|
fn new() -> Self {
|
||||||
|
Node {
|
||||||
|
successor_names: Vec::new(),
|
||||||
|
predecessor_count: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn add_successor(&mut self, successor_name: &'input str) {
|
||||||
|
self.successor_names.push(successor_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct Graph<'input> {
|
struct Graph<'input> {
|
||||||
in_edges: BTreeMap<&'input str, BTreeSet<&'input str>>,
|
nodes: HashMap<&'input str, Node<'input>>,
|
||||||
out_edges: BTreeMap<&'input str, Vec<&'input str>>,
|
|
||||||
result: Vec<&'input str>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'input> Graph<'input> {
|
impl<'input> Graph<'input> {
|
||||||
fn new() -> Self {
|
fn add_node(&mut self, name: &'input str) {
|
||||||
Self::default()
|
self.nodes.entry(name).or_insert_with(Node::new);
|
||||||
}
|
|
||||||
|
|
||||||
fn has_node(&self, n: &str) -> bool {
|
|
||||||
self.in_edges.contains_key(n)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn has_edge(&self, from: &str, to: &str) -> bool {
|
|
||||||
self.in_edges[to].contains(from)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn init_node(&mut self, n: &'input str) {
|
|
||||||
self.in_edges.insert(n, BTreeSet::new());
|
|
||||||
self.out_edges.insert(n, vec![]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn add_edge(&mut self, from: &'input str, to: &'input str) {
|
fn add_edge(&mut self, from: &'input str, to: &'input str) {
|
||||||
if !self.has_node(to) {
|
self.add_node(from);
|
||||||
self.init_node(to);
|
if from != to {
|
||||||
}
|
self.add_node(to);
|
||||||
|
|
||||||
if !self.has_node(from) {
|
let from_node = self.nodes.get_mut(from).unwrap();
|
||||||
self.init_node(from);
|
from_node.add_successor(to);
|
||||||
}
|
|
||||||
|
|
||||||
if from != to && !self.has_edge(from, to) {
|
let to_node = self.nodes.get_mut(to).unwrap();
|
||||||
self.in_edges.get_mut(to).unwrap().insert(from);
|
to_node.predecessor_count += 1;
|
||||||
self.out_edges.get_mut(from).unwrap().push(to);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/// Implementation of algorithm T from TAOCP (Don. Knuth), vol. 1.
|
||||||
|
fn run_tsort(&mut self) -> Result<Vec<&'input str>, Vec<&'input str>> {
|
||||||
|
let mut result = Vec::with_capacity(self.nodes.len());
|
||||||
|
// First, we find a node that have no prerequisites (independent nodes)
|
||||||
|
// If no such node exists, then there is a cycle.
|
||||||
|
let mut independent_nodes_queue: VecDeque<&'input str> = self
|
||||||
|
.nodes
|
||||||
|
.iter()
|
||||||
|
.filter_map(|(&name, node)| {
|
||||||
|
if node.predecessor_count == 0 {
|
||||||
|
Some(name)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
independent_nodes_queue.make_contiguous().sort_unstable(); // to make sure the resulting ordering is deterministic we need to order independent nodes
|
||||||
|
// FIXME: this doesn't comply entirely with the GNU coreutils implementation.
|
||||||
|
|
||||||
// Kahn's algorithm
|
// we remove each independent node, from the graph, updating each successor predecessor_count variable as we do.
|
||||||
// O(|V|+|E|)
|
while let Some(name_of_next_node_to_process) = independent_nodes_queue.pop_front() {
|
||||||
fn run_tsort(&mut self) {
|
result.push(name_of_next_node_to_process);
|
||||||
let mut start_nodes = vec![];
|
if let Some(node_to_process) = self.nodes.remove(name_of_next_node_to_process) {
|
||||||
for (n, edges) in &self.in_edges {
|
for successor_name in node_to_process.successor_names {
|
||||||
if edges.is_empty() {
|
let successor_node = self.nodes.get_mut(successor_name).unwrap();
|
||||||
start_nodes.push(*n);
|
successor_node.predecessor_count -= 1;
|
||||||
}
|
if successor_node.predecessor_count == 0 {
|
||||||
}
|
// if we find nodes without any other prerequisites, we add them to the queue.
|
||||||
|
independent_nodes_queue.push_back(successor_name);
|
||||||
while !start_nodes.is_empty() {
|
}
|
||||||
let n = start_nodes.remove(0);
|
|
||||||
|
|
||||||
self.result.push(n);
|
|
||||||
|
|
||||||
let n_out_edges = self.out_edges.get_mut(&n).unwrap();
|
|
||||||
#[allow(clippy::explicit_iter_loop)]
|
|
||||||
for m in n_out_edges.iter() {
|
|
||||||
let m_in_edges = self.in_edges.get_mut(m).unwrap();
|
|
||||||
m_in_edges.remove(&n);
|
|
||||||
|
|
||||||
// If m doesn't have other in-coming edges add it to start_nodes
|
|
||||||
if m_in_edges.is_empty() {
|
|
||||||
start_nodes.push(m);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
n_out_edges.clear();
|
}
|
||||||
|
|
||||||
|
// if the graph has no cycle (it's a dependency tree), the graph should be empty now, as all nodes have been deleted.
|
||||||
|
if self.nodes.is_empty() {
|
||||||
|
Ok(result)
|
||||||
|
} else {
|
||||||
|
// otherwise, we detect and show a cycle to the user (as the GNU coreutils implementation does)
|
||||||
|
Err(self.detect_cycle())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_acyclic(&self) -> bool {
|
fn detect_cycle(&self) -> Vec<&'input str> {
|
||||||
self.out_edges.values().all(|edge| edge.is_empty())
|
let mut visited = HashSet::new();
|
||||||
|
let mut stack = Vec::with_capacity(self.nodes.len());
|
||||||
|
for &node in self.nodes.keys() {
|
||||||
|
if !visited.contains(node) && self.dfs(node, &mut visited, &mut stack) {
|
||||||
|
return stack;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unreachable!();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn dfs(
|
||||||
|
&self,
|
||||||
|
node: &'input str,
|
||||||
|
visited: &mut HashSet<&'input str>,
|
||||||
|
stack: &mut Vec<&'input str>,
|
||||||
|
) -> bool {
|
||||||
|
if stack.contains(&node) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if visited.contains(&node) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
visited.insert(node);
|
||||||
|
stack.push(node);
|
||||||
|
|
||||||
|
if let Some(successor_names) = self.nodes.get(node).map(|n| &n.successor_names) {
|
||||||
|
for &successor in successor_names {
|
||||||
|
if self.dfs(successor, visited, stack) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stack.pop();
|
||||||
|
false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue