Skip to main content

duplicates/
duplicates.rs

1use std::{
2    collections::HashMap,
3    env, io,
4    path::{Path, PathBuf},
5    process,
6    sync::{Arc, Mutex},
7    thread,
8};
9
10use crypto::{Hasher, sha2::Sha256};
11use tokio::{fs::File, io::AsyncReadExt, sync::Semaphore, task};
12use walkdir::WalkDir;
13
14/// Scans one or more directories for duplicate files and prints groups
15/// sorted by total disk space used (size × occurrence count).
16///
17/// Each group lists the human-readable total size, the SHA-256 hash, and
18/// every path sharing that hash.
19///
20/// # Errors
21///
22/// Exits with a non-zero status if fewer than one directory argument is
23/// provided. Individual file-hashing errors are printed to stderr and
24/// do not halt the scan.
25#[tokio::main]
26async fn main() {
27    let args: Vec<String> = env::args().collect();
28    if args.len() < 2 {
29        eprintln!("usage: duplicates <folder1> [folder2 ...]");
30        process::exit(1);
31    }
32
33    let results: Arc<Mutex<HashMap<[u8; 32], (u64, Vec<PathBuf>)>>> = Arc::new(Mutex::new(HashMap::new()));
34    let semaphore = Arc::new(Semaphore::new(thread::available_parallelism().unwrap().get()));
35    let mut handles = Vec::with_capacity(args.len() - 1);
36
37    for arg in &args[1..] {
38        let dir = PathBuf::from(arg);
39        let results = Arc::clone(&results);
40        let semaphore = Arc::clone(&semaphore);
41        let handle = task::spawn(async move {
42            if let Err(e) = process_directory(&dir, &results, &semaphore).await {
43                eprintln!("error processing '{}': {e}", dir.display());
44            }
45        });
46        handles.push(handle);
47    }
48
49    for handle in handles {
50        let _ = handle.await;
51    }
52
53    let map = results.lock().unwrap();
54    let mut groups: Vec<(u64, &[u8; 32])> = map
55        .iter()
56        .filter(|(_, (_, paths))| paths.len() > 1)
57        .map(|(hash, (size, paths))| (size * paths.len() as u64, hash))
58        .collect();
59    groups.sort_by(|a, b| b.0.cmp(&a.0));
60
61    for (total, hash) in groups {
62        let (_, paths) = &map[hash];
63        println!("{}: {}:", format_size(total), hex::encode(hash));
64        let mut sorted = paths.clone();
65        sorted.sort();
66        for p in &sorted {
67            println!("    {}", p.display());
68        }
69    }
70}
71
72async fn process_directory(
73    dir: &Path,
74    results: &Arc<Mutex<HashMap<[u8; 32], (u64, Vec<PathBuf>)>>>,
75    semaphore: &Arc<Semaphore>,
76) -> Result<(), io::Error> {
77    let mut handles = vec![];
78
79    for entry in WalkDir::new(dir).follow_links(false) {
80        let entry = entry.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
81        if entry.file_type().is_file() && !entry.file_type().is_symlink() {
82            let path = entry.into_path();
83            let results = Arc::clone(results);
84            let semaphore = Arc::clone(semaphore);
85            let handle = task::spawn(async move {
86                let _permit = semaphore.acquire().await.unwrap();
87                match hash_file(&path).await {
88                    Ok((hash, size)) => {
89                        let mut map = results.lock().unwrap();
90                        map.entry(hash).or_insert_with(|| (size, Vec::new())).1.push(path);
91                    }
92                    Err(e) => {
93                        eprintln!("error hashing '{}': {e}", path.display());
94                    }
95                }
96            });
97            handles.push(handle);
98        }
99    }
100
101    for handle in handles {
102        let _ = handle.await;
103    }
104
105    Ok(())
106}
107
108async fn hash_file(path: &Path) -> Result<([u8; 32], u64), io::Error> {
109    let mut file = File::open(path).await?;
110    let size = file.metadata().await?.len();
111    let mut hasher = Sha256::new();
112    let mut buf = vec![0u8; 1024 * 1024];
113    loop {
114        let n = file.read(&mut buf).await?;
115        if n == 0 {
116            break;
117        }
118        hasher.update(&buf[..n]);
119    }
120    let hash = hasher.sum();
121    Ok((hash.as_ref().try_into().unwrap(), size))
122}
123
/// Formats a byte count as a whole number followed by a binary unit suffix
/// (`B`, `KB`, `MB`, `GB`, `TB`; each step is a factor of 1024).
///
/// The value is rounded to zero decimal places. A value that would round up
/// to 1024 of one unit is promoted to the next unit instead, so one byte short
/// of a mebibyte prints as `1MB` rather than `1024KB`. `TB` is the largest
/// unit; anything bigger is expressed as a (possibly large) number of `TB`.
fn format_size(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];
    // Anything at or above this prints as "1024" with zero decimals, so it
    // belongs to the next unit up. For whole byte counts this is equivalent to
    // the plain `>= 1024` test.
    const ROLLOVER: f64 = 1023.5;

    let mut val = bytes as f64;
    let mut idx = 0;
    while val >= ROLLOVER && idx < UNITS.len() - 1 {
        val /= 1024.0;
        idx += 1;
    }
    format!("{:.0}{}", val, UNITS[idx])
}
133}