summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile15
-rw-r--r--TODO5
-rw-r--r--hadori.132
-rw-r--r--hadori.C205
-rw-r--r--inode.h64
5 files changed, 321 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..736befb
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+# Build rules for hadori.
+#
+# Libraries are listed in LDLIBS (not LDFLAGS) so that they appear after the
+# object files on the link line; linkers that resolve symbols strictly from
+# left to right otherwise drop them before hadori.o is seen.
+LDLIBS+=-lz -lboost_program_options
+CXXFLAGS?=-O2 -Wall
+CXXFLAGS+=-std=c++0x
+CPPFLAGS+=-D_FILE_OFFSET_BITS=64
+
+.PHONY: all clean
+
+all: hadori
+
+# The manual page is generated from the program's own --help output.
+hadori.1: hadori
+	help2man -n $< -o $@ -N --no-discard-stderr --version-string 0.1 ./$<
+
+# Explicit recipe: make's built-in "link from .o" rule invokes $(CC), which
+# does not link the C++ runtime that hadori.o needs.
+hadori: hadori.o
+	$(CXX) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+hadori.o: hadori.C inode.h
+
+clean:
+	rm -f hadori hadori.o
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..642824e
--- /dev/null
+++ b/TODO
@@ -0,0 +1,5 @@
+TODO/possible optimizations
+===========================
+
+* forget inode in to_link if link count is 1 before linking
+* more debug output when attributes mismatch in handle_file()
diff --git a/hadori.1 b/hadori.1
new file mode 100644
index 0000000..c1218b9
--- /dev/null
+++ b/hadori.1
@@ -0,0 +1,32 @@
+.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.40.4.
+.TH HADORI "1" "November 2011" "hadori 0.1" "User Commands"
+.SH NAME
+hadori \- hadori
+.SH DESCRIPTION
+Invocation: hadori [ OPTIONS ] [ ARGUMENTS ]
+OPTIONS:
+.TP
+\fB\-h\fR [ \fB\-\-help\fR ]
+print this help message
+.TP
+\fB\-t\fR [ \fB\-\-no\-time\fR ]
+ignore mtime
+.TP
+\fB\-\-hash\fR
+use adler32 hash to speed up comparing many files with
+same size and mostly identical content
+.TP
+\fB\-n\fR [ \fB\-\-dry\-run\fR ]
+don't change anything, implies \fB\-\-verbose\fR
+.TP
+\fB\-v\fR [ \fB\-\-verbose\fR ]
+show which files get linked
+.TP
+\fB\-d\fR [ \fB\-\-debug\fR ]
+show files being examined
+.TP
+\fB\-s\fR [ \fB\-\-stdin\fR ]
+read arguments from stdin, one per line
+.TP
+\fB\-0\fR [ \fB\-\-null\fR ]
+implies \fB\-\-stdin\fR, but use null bytes as delimiter
diff --git a/hadori.C b/hadori.C
new file mode 100644
index 0000000..fcdebd4
--- /dev/null
+++ b/hadori.C
@@ -0,0 +1,205 @@
+#include <boost/program_options.hpp>
+namespace po = boost::program_options;
+
+#include <string>
+#include <vector>
+#include <queue>
+#include <map>
+#include <iostream>
+#include <sstream>
+
+#include <cstdlib>
+#include <cstring>
+#include <cerrno>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <sysexits.h>
+
+#include "inode.h"
+
+std::map<ino_t, inode const> kept;
+std::map<ino_t, ino_t> to_link;
+std::multimap<off_t, ino_t> sizes;
+po::variables_map config;
+std::ostream debug(std::clog.rdbuf()), verbose(std::clog.rdbuf()), error(std::clog.rdbuf());
+
+/**
+ * Replace the file named `other` by a hard link to the inode `i`.
+ *
+ * The sequence is: probe with link() (which is expected to fail with EEXIST
+ * because `other` still exists), unlink `other`, then link() again. Every
+ * unexpected outcome is reported on `error` and terminates the program with
+ * EX_UNAVAILABLE.
+ *
+ * NOTE: the replacement is not atomic. Between unlink() and the second
+ * link() the name `other` does not exist, and if that second link() fails
+ * the name stays removed (the last error message says so).
+ */
+void do_link (inode const & i, std::string const & other) {
+	char const * const source = i.filename.c_str();
+	char const * const destination = other.c_str();
+
+	// Probe: succeeding here means the destination vanished under us.
+	if (link(source, destination) == 0) {
+		error << "linking " << i << " to " << other << " succeeded before unlinking (race condition)" << std::endl;
+		exit(EX_UNAVAILABLE);
+	}
+	// Anything but "destination exists" means linking cannot work at all.
+	if (errno != EEXIST) {
+		// fetch the message first, stream output may change errno
+		char * reason = strerror(errno);
+		error << "error linking " << i << " to " << other << ": " << reason << ", nothing bad happened." << std::endl;
+		exit(EX_UNAVAILABLE);
+	}
+	if (unlink(destination) != 0) {
+		char * reason = strerror(errno);
+		error << "error unlinking " << other << " before linking " << i << " to it: " << reason << std::endl;
+		exit(EX_UNAVAILABLE);
+	}
+	if (link(source, destination) != 0) {
+		char * reason = strerror(errno);
+		error << "error linking " << i << " to " << other << ": " << reason << ", destination filename was already unlinked." << std::endl;
+		exit(EX_UNAVAILABLE);
+	}
+}
+
+/**
+ * Process one regular file.
+ *
+ * @param path  name of the file
+ * @param s     result of lstat() on `path`
+ *
+ * Three cases are distinguished by the file's inode number:
+ *  - the inode is already in `kept`: nothing to do;
+ *  - the inode is already in `to_link`: this is another name of a file that
+ *    was merged earlier, so this name is linked to the same kept inode;
+ *  - the inode is new: it is compared against every kept inode of the same
+ *    size and linked to the first one that matches in mode, uid, gid, mtime
+ *    (unless --no-time), optionally adler32 (--hash) and finally content.
+ *    If nothing matches, the inode itself is kept.
+ *
+ * With --dry-run the bookkeeping is done as usual but do_link() is never
+ * called, in either linking case.
+ */
+void handle_file(std::string const & path, struct stat const & s) {
+	debug << "examining " << path << std::endl;
+	if (kept.count(s.st_ino)) {
+		debug << "another link to inode " << s.st_ino << " that we keep" << std::endl;
+		return;
+	}
+	auto const merged = to_link.find(s.st_ino);
+	if (merged != to_link.end()) {
+		inode const & target = kept.find(merged->second)->second;
+		debug << "another link to inode " << s.st_ino << " that we merge with " << target << std::endl;
+		verbose << "linking " << target << " to " << path << std::endl;
+		// honour --dry-run here as well; the inode was recorded in to_link
+		// even when the first name was only reported, not linked
+		if (not config.count("dry-run"))
+			do_link(target, path);
+		return;
+	}
+	inode f(path, s);
+	debug << f << " is new to us" << std::endl;
+	// all kept inodes with exactly this file size
+	auto const same_size = sizes.equal_range(s.st_size);
+	for (auto it = same_size.first; it != same_size.second; ++it) {
+		inode const & candidate = kept.find(it->second)->second;
+		debug << "looking if it matches " << candidate << std::endl;
+		if (candidate.stat.st_mode != s.st_mode)
+			continue;
+		if (candidate.stat.st_uid != s.st_uid)
+			continue;
+		if (candidate.stat.st_gid != s.st_gid)
+			continue;
+		if (not config.count("no-time"))
+			if (candidate.stat.st_mtime != s.st_mtime)
+				continue;
+		// cheap rejection before the byte-wise comparison
+		if (config.count("hash"))
+			if (candidate.get_adler() != f.get_adler())
+				continue;
+		if (!compare(candidate, f))
+			continue;
+		verbose << "linking " << candidate << " to " << path << std::endl;
+		to_link.insert(std::make_pair(s.st_ino, it->second));
+		if (not config.count("dry-run"))
+			do_link(candidate, path);
+		return;
+	}
+	debug << "we keep " << f << std::endl;
+	kept.insert(std::make_pair(s.st_ino, f));
+	sizes.insert(std::make_pair(s.st_size, s.st_ino));
+}
+
+/**
+ * Scan the directory `dir`: hand every regular file to handle_file() and
+ * descend into every subdirectory.
+ *
+ * @param dir  path of the directory to scan
+ * @param dev  device number all entries must reside on; entries whose
+ *             st_dev differs are reported and skipped
+ *
+ * Errors from opendir(), readdir() and lstat() are reported on `error` and
+ * the affected directory or entry is skipped. The entries "." and ".." are
+ * ignored without being examined.
+ */
+void recurse (std::string const & dir, dev_t const dev) {
+	DIR * const D = opendir(dir.c_str());
+	if (!D) {
+		char * errstring = strerror(errno);
+		error << "opendir(\"" << dir << "\"): " << errstring << std::endl;
+		return;
+	}
+
+	std::queue<std::string> subdirs;
+	for (;;) {
+		// readdir() returns a null pointer at the end of the directory and
+		// on failure alike; only errno tells the two apart
+		errno = 0;
+		struct dirent const * const d = readdir(D);
+		if (!d) {
+			if (errno) {
+				char * errstring = strerror(errno);
+				error << "readdir(\"" << dir << "\"): " << errstring << std::endl;
+			}
+			break;
+		}
+		// skip these before lstat(): ".." may live on another file system
+		// and would otherwise trigger a misleading message
+		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+			continue;
+
+		std::string path(dir);
+		path += '/';
+		path += d->d_name;
+
+		struct stat s;
+		if (lstat(path.c_str(), &s)) {
+			char * errstring = strerror(errno);
+			error << "lstat(\"" << path << "\"): " << errstring << std::endl;
+			continue;
+		}
+		if (s.st_dev != dev) {
+			error << path << " resides on another file system, ignoring." << std::endl;
+			continue;
+		}
+		if (S_ISDIR(s.st_mode))
+			subdirs.push(path);
+		else if (S_ISREG(s.st_mode))
+			handle_file(path, s);
+	}
+	closedir(D);
+	// directories get handled after the parent dir is closed to prevent exhausting fds
+	for (; !subdirs.empty(); subdirs.pop())
+		recurse(subdirs.front(), dev);
+}
+
+/**
+ * Entry point for one command line (or stdin) argument.
+ *
+ * `dir` may name a directory, which is scanned with recurse(), or a regular
+ * file, which is passed to handle_file() directly. If lstat() fails the
+ * program exits with EX_NOINPUT.
+ *
+ * The device number of the first argument ever processed is remembered for
+ * the lifetime of the program; later arguments on a different device are
+ * reported and ignored.
+ */
+void recurse_start (std::string const & dir) {
+	struct stat info;
+	if (lstat(dir.c_str(), &info) != 0) {
+		char * reason = strerror(errno);
+		error << "lstat(\"" << dir << "\"): " << reason << std::endl;
+		exit(EX_NOINPUT);
+	}
+
+	// initialised exactly once, by the first call that gets this far
+	static dev_t const first_dev = info.st_dev;
+	if (info.st_dev != first_dev) {
+		error << dir << " resides on another file system, ignoring." << std::endl;
+		return;
+	}
+
+	if (S_ISDIR(info.st_mode)) {
+		recurse(dir, first_dev);
+	} else if (S_ISREG(info.st_mode)) {
+		handle_file(dir, info);
+	}
+}
+
+/**
+ * Parse the command line, configure the output channels and process every
+ * argument with recurse_start().
+ *
+ * Arguments are taken either from the command line or, if there are none,
+ * from standard input (newline separated, or null separated with --null).
+ * Mixing both sources is rejected.
+ *
+ * @return EX_OK on success, EX_USAGE for --help and for command lines that
+ *         cannot be parsed or combine --stdin/--null with arguments.
+ */
+int main (int const argc, char ** argv) {
+	po::options_description opts("OPTIONS");
+	opts.add_options()
+		("help,h", "print this help message")
+		("no-time,t", "ignore mtime")
+		("hash", "use adler32 hash to speed up comparing many files with same size and mostly identical content")
+		("dry-run,n", "don't change anything, implies --verbose")
+		("verbose,v", "show which files get linked")
+		("debug,d", "show files being examined")
+		("stdin,s", "read arguments from stdin, one per line")
+		("null,0", "implies --stdin, but use null bytes as delimiter")
+		;
+	// "args" collects the positional arguments; it is kept out of `opts` so
+	// that it does not show up in the help text
+	po::options_description all_opts;
+	all_opts.add(opts);
+	all_opts.add_options()
+		("args", po::value< std::vector<std::string> >(), "files and directories to work on")
+		;
+	po::positional_options_description pos_opts;
+	pos_opts.add("args", -1);
+	try {
+		po::store(po::command_line_parser(argc, argv).options(all_opts).positional(pos_opts).run(), config);
+		po::notify(config);
+	} catch (po::error const & e) {
+		// unknown option, missing value, ...: report it instead of aborting
+		// with an uncaught exception
+		error << e.what() << std::endl;
+		error << "Invocation: hadori [ OPTIONS ] [ ARGUMENTS ]" << std::endl;
+		error << opts << std::endl;
+		return EX_USAGE;
+	}
+
+	if (config.count("help")) {
+		error << "Invocation: hadori [ OPTIONS ] [ ARGUMENTS ]" << std::endl;
+		error << opts << std::endl;
+		return EX_USAGE;
+	}
+
+	// a stream without a buffer discards everything written to it
+	if (not config.count("debug"))
+		debug.rdbuf(nullptr);
+	if (not config.count("debug") and not config.count("verbose") and not config.count("dry-run"))
+		verbose.rdbuf(nullptr);
+
+	if (config.count("args")) {
+		if (config.count("stdin") or config.count("null")) {
+			// not supported because we don't know which arguments to scan first
+			error << "--stdin combined with commandline arguments, this is not supported." << std::endl;
+			return EX_USAGE;
+		}
+		for(std::string const & dir : config["args"].as< std::vector<std::string> >())
+			recurse_start(dir);
+	} else {
+		if (not config.count("stdin") and not config.count("null"))
+			error << "no arguments supplied, assuming --stdin." << std::endl;
+		char delim = '\n';
+		if (config.count("null"))
+			delim = '\0';
+		for (std::string dir; getline(std::cin, dir, delim);)
+			recurse_start(dir);
+	}
+
+	return EX_OK;
+}
diff --git a/inode.h b/inode.h
new file mode 100644
index 0000000..3040e71
--- /dev/null
+++ b/inode.h
@@ -0,0 +1,64 @@
+#include <string>
+#include <ostream>
+#include <fstream>
+
+#include <cstring>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <zlib.h>
+
+/**
+ * A file identified by its inode: one name under which it can be opened,
+ * the stat data recorded for it, and a lazily computed adler32 checksum of
+ * its content.
+ */
+class inode {
+public:
+	// a path under which the inode can be opened
+	std::string const filename;
+	// stat data as supplied by the caller at construction time
+	struct stat const stat;
+protected:
+	// cached adler32 of the content; uLong(-1) means "not computed yet"
+	uLong mutable adler;
+
+public:
+	inode (std::string const &, struct stat const &);
+
+	uLong get_adler () const;
+
+	friend bool compare (inode const &, inode const &);
+	friend std::ostream& operator<< (std::ostream&, inode const &);
+};
+
+/**
+ * @param name  path of the file
+ * @param info  stat data of the file; copied into the object
+ */
+inline inode::inode (std::string const & name, struct stat const & info) : filename(name), stat(info), adler(uLong(-1)) {
+}
+
+/**
+ * Return the adler32 checksum of the file content, computing it on first
+ * use and caching it afterwards.
+ *
+ * If the file cannot be opened, or reading stops early because of an error,
+ * the checksum of the bytes read so far is returned (the checksum of empty
+ * input if nothing could be read).
+ */
+inline uLong inode::get_adler () const {
+	if (adler == uLong(-1)) {
+		char buffer[1 << 14];
+		std::ifstream f(filename.c_str(), std::ios::in | std::ios::binary);
+
+		adler = adler32(0L, Z_NULL, 0);
+		// Test the whole stream state, not just eof(): a stream that failed
+		// to open or hit a read error never reaches eof and would loop forever.
+		while (f) {
+			f.read(buffer, sizeof(buffer));
+			// the final, short read sets eof but still delivers gcount() bytes
+			adler = adler32(adler, reinterpret_cast<Bytef const *>(buffer), static_cast<uInt>(f.gcount()));
+		}
+	}
+	return adler;
+}
+
+/**
+ * Compare the content of two files byte by byte.
+ *
+ * @return true only if both files could be opened, were read completely and
+ *         have identical content. Any open or read failure yields false, so
+ *         that files which could not be verified are never reported equal.
+ */
+inline bool compare (inode const & l, inode const & r) {
+	std::ifstream lf(l.filename.c_str(), std::ios::in | std::ios::binary);
+	std::ifstream rf(r.filename.c_str(), std::ios::in | std::ios::binary);
+	if (!lf || !rf)
+		return false;
+
+	char lbuffer[1 << 14];
+	char rbuffer[1 << 14];
+	// Loop on the stream state rather than on eof() alone: after a read
+	// error eof is never set and the loop would not terminate.
+	do {
+		lf.read(lbuffer, sizeof(lbuffer));
+		rf.read(rbuffer, sizeof(rbuffer));
+		if (lf.gcount() != rf.gcount())
+			return false;
+		if (memcmp(lbuffer, rbuffer, lf.gcount()))
+			return false;
+	} while (lf && rf);
+	// equal only if both sides stopped because they reached their end
+	return lf.eof() && rf.eof();
+}
+
+/**
+ * Print a short human readable description of an inode: its inode number
+ * and the filename it is known by.
+ */
+inline std::ostream& operator<< (std::ostream& os, inode const & i) {
+	return os << "Inode " << i.stat.st_ino << ", represented by " << i.filename;
+}