1use std::{
2 collections::HashMap,
3 env, io,
4 path::{Path, PathBuf},
5 process,
6 sync::{Arc, Mutex},
7 thread,
8};
9
10use crypto::{Hasher, sha2::Sha256};
11use tokio::{fs::File, io::AsyncReadExt, sync::Semaphore, task};
12use walkdir::WalkDir;
13
14#[tokio::main]
26async fn main() {
27 let args: Vec<String> = env::args().collect();
28 if args.len() < 2 {
29 eprintln!("usage: duplicates <folder1> [folder2 ...]");
30 process::exit(1);
31 }
32
33 let results: Arc<Mutex<HashMap<[u8; 32], (u64, Vec<PathBuf>)>>> = Arc::new(Mutex::new(HashMap::new()));
34 let semaphore = Arc::new(Semaphore::new(thread::available_parallelism().unwrap().get()));
35 let mut handles = Vec::with_capacity(args.len() - 1);
36
37 for arg in &args[1..] {
38 let dir = PathBuf::from(arg);
39 let results = Arc::clone(&results);
40 let semaphore = Arc::clone(&semaphore);
41 let handle = task::spawn(async move {
42 if let Err(e) = process_directory(&dir, &results, &semaphore).await {
43 eprintln!("error processing '{}': {e}", dir.display());
44 }
45 });
46 handles.push(handle);
47 }
48
49 for handle in handles {
50 let _ = handle.await;
51 }
52
53 let map = results.lock().unwrap();
54 let mut groups: Vec<(u64, &[u8; 32])> = map
55 .iter()
56 .filter(|(_, (_, paths))| paths.len() > 1)
57 .map(|(hash, (size, paths))| (size * paths.len() as u64, hash))
58 .collect();
59 groups.sort_by(|a, b| b.0.cmp(&a.0));
60
61 for (total, hash) in groups {
62 let (_, paths) = &map[hash];
63 println!("{}: {}:", format_size(total), hex::encode(hash));
64 let mut sorted = paths.clone();
65 sorted.sort();
66 for p in &sorted {
67 println!(" {}", p.display());
68 }
69 }
70}
71
72async fn process_directory(
73 dir: &Path,
74 results: &Arc<Mutex<HashMap<[u8; 32], (u64, Vec<PathBuf>)>>>,
75 semaphore: &Arc<Semaphore>,
76) -> Result<(), io::Error> {
77 let mut handles = vec![];
78
79 for entry in WalkDir::new(dir).follow_links(false) {
80 let entry = entry.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
81 if entry.file_type().is_file() && !entry.file_type().is_symlink() {
82 let path = entry.into_path();
83 let results = Arc::clone(results);
84 let semaphore = Arc::clone(semaphore);
85 let handle = task::spawn(async move {
86 let _permit = semaphore.acquire().await.unwrap();
87 match hash_file(&path).await {
88 Ok((hash, size)) => {
89 let mut map = results.lock().unwrap();
90 map.entry(hash).or_insert_with(|| (size, Vec::new())).1.push(path);
91 }
92 Err(e) => {
93 eprintln!("error hashing '{}': {e}", path.display());
94 }
95 }
96 });
97 handles.push(handle);
98 }
99 }
100
101 for handle in handles {
102 let _ = handle.await;
103 }
104
105 Ok(())
106}
107
108async fn hash_file(path: &Path) -> Result<([u8; 32], u64), io::Error> {
109 let mut file = File::open(path).await?;
110 let size = file.metadata().await?.len();
111 let mut hasher = Sha256::new();
112 let mut buf = vec![0u8; 1024 * 1024];
113 loop {
114 let n = file.read(&mut buf).await?;
115 if n == 0 {
116 break;
117 }
118 hasher.update(&buf[..n]);
119 }
120 let hash = hasher.sum();
121 Ok((hash.as_ref().try_into().unwrap(), size))
122}
123
/// Formats a byte count as a whole number followed by a unit suffix
/// (`B`, `KB`, `MB`, `GB`, `TB`), scaling by factors of 1024.
///
/// The value is rounded to zero decimal places. Values beyond the largest
/// unit stay in `TB` (e.g. `2048TB`).
fn format_size(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];
    let mut val = bytes as f64;
    let mut idx = 0;
    // Test the *rounded* value: something like 1023.99 KB would otherwise be
    // printed as "1024KB" instead of rolling over to "1MB".
    while val.round() >= 1024.0 && idx < UNITS.len() - 1 {
        val /= 1024.0;
        idx += 1;
    }
    format!("{:.0}{}", val, UNITS[idx])
}