From 148e9eab1109789916a7b55ece957156a57ee4cf Mon Sep 17 00:00:00 2001
From: Timo Weingärtner
Date: Sun, 6 Nov 2011 00:00:31 +0100
Subject: initial commit

---
 Makefile |  15 ++++
 TODO     |   5 ++
 hadori.1 |  32 ++++++++
 hadori.C | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 inode.h  |  81 ++++++++++++++++++++
 5 files changed, 384 insertions(+)
 create mode 100644 Makefile
 create mode 100644 TODO
 create mode 100644 hadori.1
 create mode 100644 hadori.C
 create mode 100644 inode.h

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..736befb
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+LDFLAGS+=-lz -lboost_program_options
+CXXFLAGS?=-O2 -Wall
+CXXFLAGS+=-std=c++0x
+CPPFLAGS+=-D_FILE_OFFSET_BITS=64
+
+all: hadori
+
+hadori.1: hadori
+	help2man -n $< -o $@ -N --no-discard-stderr --version-string 0.1 ./$<
+
+hadori: hadori.o
+hadori.o: hadori.C inode.h
+
+clean:
+	rm -f hadori hadori.o
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..642824e
--- /dev/null
+++ b/TODO
@@ -0,0 +1,5 @@
+TODO/possible optimizations
+===========================
+
+* forget inode in to_link if link count is 1 before linking
+* more debug output when attributes mismatch in handle_file()
diff --git a/hadori.1 b/hadori.1
new file mode 100644
index 0000000..c1218b9
--- /dev/null
+++ b/hadori.1
@@ -0,0 +1,32 @@
+.\" DO NOT MODIFY THIS FILE!  It was generated by help2man 1.40.4.
+.TH HADORI "1" "November 2011" "hadori 0.1" "User Commands"
+.SH NAME
+hadori \- hadori
+.SH DESCRIPTION
+Invocation: hadori [ OPTIONS ] [ ARGUMENTS ]
+OPTIONS:
+.TP
+\fB\-h\fR [ \fB\-\-help\fR ]
+print this help message
+.TP
+\fB\-t\fR [ \fB\-\-no\-time\fR ]
+ignore mtime
+.TP
+\fB\-\-hash\fR
+use adler32 hash to speed up comparing many files with
+same size and mostly identical content
+.TP
+\fB\-n\fR [ \fB\-\-dry\-run\fR ]
+don't change anything, implies \fB\-\-verbose\fR
+.TP
+\fB\-v\fR [ \fB\-\-verbose\fR ]
+show which files get linked
+.TP
+\fB\-d\fR [ \fB\-\-debug\fR ]
+show files being examined
+.TP
+\fB\-s\fR [ \fB\-\-stdin\fR ]
+read arguments from stdin, one per line
+.TP
+\fB\-0\fR [ \fB\-\-null\fR ]
+implies \fB\-\-stdin\fR, but use null bytes as delimiter
diff --git a/hadori.C b/hadori.C
new file mode 100644
index 0000000..fcdebd4
--- /dev/null
+++ b/hadori.C
@@ -0,0 +1,251 @@
+#include <boost/program_options.hpp>
+namespace po = boost::program_options;
+
+#include <iostream>
+#include <map>
+#include <queue>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <dirent.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "inode.h"
+
+// inodes we keep, indexed by inode number
+std::map<ino_t, inode> kept;
+// inode number of an inode to be replaced -> inode number of the kept inode it gets linked to
+std::map<ino_t, ino_t> to_link;
+// file size -> inode numbers of the kept inodes having that size
+std::multimap<off_t, ino_t> sizes;
+// parsed command line
+po::variables_map config;
+// all three write to stderr; main() mutes debug and verbose unless they were asked for
+std::ostream debug(std::clog.rdbuf()), verbose(std::clog.rdbuf()), error(std::clog.rdbuf());
+
+// Replace the file named `other` by a hard link to the kept inode `i`.
+//
+// The link is created under a temporary name in the directory of `other` and
+// then rename()d over it, so `other` never stops existing and a failure (e.g.
+// too many links to `i`) is noticed before anything got removed.
+// Every failure ends the program with EX_UNAVAILABLE.
+void do_link (inode const & i, std::string const & other) {
+	// keep the temporary name short and in the same directory (same file system)
+	std::string::size_type const slash = other.rfind('/');
+	std::ostringstream name;
+	if (slash != std::string::npos)
+		name << other.substr(0, slash + 1);
+	name << ".hadori." << getpid();
+	std::string const temp(name.str());
+
+	if (link(i.filename.c_str(), temp.c_str())) {
+		char * errstring = strerror(errno);
+		error << "error linking " << i << " to " << temp << ": " << errstring << ", nothing bad happened." << std::endl;
+		exit(EX_UNAVAILABLE);
+	}
+	if (rename(temp.c_str(), other.c_str())) {
+		char * errstring = strerror(errno);
+		unlink(temp.c_str());
+		error << "error renaming " << temp << " to " << other << ": " << errstring << ", nothing bad happened." << std::endl;
+		exit(EX_UNAVAILABLE);
+	}
+	// rename() succeeds without doing anything if both names already refer to
+	// the same inode; the temporary name is still there then and has to go
+	unlink(temp.c_str());
+}
+
+// Handle the regular file `path`, whose lstat() result is `s`. It is either
+// another name of an inode we already know, a duplicate of a kept inode (which
+// gets replaced by a hard link unless --dry-run was given) or something new
+// that we keep to compare later files against.
+void handle_file(std::string const & path, struct stat const & s) {
+	debug << "examining " << path << std::endl;
+	if (kept.count(s.st_ino)) {
+		debug << "another link to inode " << s.st_ino << " that we keep" << std::endl;
+		return;
+	}
+	auto const known = to_link.find(s.st_ino);
+	if (known != to_link.end()) {
+		inode const & target = kept.find(known->second)->second;
+		debug << "another link to inode " << s.st_ino << " that we merge with " << target << std::endl;
+		// --dry-run fills to_link as well, so it has to be honoured here, too
+		if (not config.count("dry-run"))
+			do_link(target, path);
+		return;
+	}
+	inode f(path, s);
+	debug << f << " is new to us" << std::endl;
+	auto const same_size = sizes.equal_range(s.st_size);
+	for (auto it = same_size.first; it != same_size.second; ++it) {
+		inode const & candidate = kept.find(it->second)->second;
+		debug << "looking if it matches " << candidate << std::endl;
+		// cheap attribute checks first, reading file contents comes last
+		if (candidate.stat.st_mode != s.st_mode)
+			continue;
+		if (candidate.stat.st_uid != s.st_uid)
+			continue;
+		if (candidate.stat.st_gid != s.st_gid)
+			continue;
+		if (not config.count("no-time") and candidate.stat.st_mtime != s.st_mtime)
+			continue;
+		if (config.count("hash") and candidate.get_adler() != f.get_adler())
+			continue;
+		if (!compare(candidate, f))
+			continue;
+		verbose << "linking " << candidate << " to " << path << std::endl;
+		to_link.insert(std::make_pair(s.st_ino, it->second));
+		if (not config.count("dry-run"))
+			do_link(candidate, path);
+		return;
+	}
+	debug << "we keep " << f << std::endl;
+	kept.insert(std::make_pair(s.st_ino, f));
+	sizes.insert(std::make_pair(s.st_size, s.st_ino));
+}
+
+// Scan the directory `dir`: regular files go to handle_file(), subdirectories
+// are descended into afterwards. Entries that are not on device `dev` are
+// reported and skipped.
+void recurse (std::string const & dir, dev_t const dev) {
+	DIR * const D = opendir(dir.c_str());
+	if (!D) {
+		char * errstring = strerror(errno);
+		error << "opendir(\"" << dir << "\"): " << errstring << std::endl;
+		return;
+	}
+
+	std::queue<std::string> subdirs;
+	int readdir_errno = 0;
+	for (;;) {
+		// readdir() returns NULL at the end and on errors, only errno tells them apart
+		errno = 0;
+		struct dirent const * const d = readdir(D);
+		if (!d) {
+			readdir_errno = errno;
+			break;
+		}
+		std::string const name(d->d_name);
+		// ".." of a mount point lives on another device and must not be reported
+		if (name == "." || name == "..")
+			continue;
+		std::string const path(dir + '/' + name);
+		struct stat s;
+		if (lstat(path.c_str(), &s)) {
+			char * errstring = strerror(errno);
+			error << "lstat(\"" << path << "\"): " << errstring << std::endl;
+			continue;
+		}
+		if (s.st_dev != dev) {
+			error << path << " resides on another file system, ignoring." << std::endl;
+			continue;
+		}
+		if (S_ISDIR(s.st_mode))
+			subdirs.push(path);
+		else if (S_ISREG(s.st_mode))
+			handle_file(path, s);
+	}
+	closedir(D);
+	if (readdir_errno)
+		error << "readdir(\"" << dir << "\"): " << strerror(readdir_errno) << std::endl;
+
+	// directories get handled after the parent dir is closed to prevent exhausting fds
+	for (; !subdirs.empty(); subdirs.pop())
+		recurse(subdirs.front(), dev);
+}
+
+// Process one command line (or stdin) argument, a directory or a regular file.
+// A failing lstat() ends the program with EX_NOINPUT.
+void recurse_start (std::string const & dir) {
+	struct stat s;
+
+	if (lstat(dir.c_str(), &s)) {
+		char * errstring = strerror(errno);
+		error << "lstat(\"" << dir << "\"): " << errstring << std::endl;
+		exit(EX_NOINPUT);
+	}
+
+	// initialised by the first call only: hard links cannot cross file systems,
+	// so every later argument has to reside on the device of the first one
+	static dev_t const dev = s.st_dev;
+	if (dev != s.st_dev) {
+		error << dir << " resides on another file system, ignoring." << std::endl;
+		return;
+	}
+
+	if (S_ISDIR(s.st_mode))
+		recurse(dir, dev);
+	else if (S_ISREG(s.st_mode))
+		handle_file(dir, s);
+}
+
+// Parse the command line, then scan the arguments given there or read from stdin.
+int main (int const argc, char ** argv) {
+	po::options_description opts("OPTIONS");
+	opts.add_options()
+		("help,h", "print this help message")
+		("no-time,t", "ignore mtime")
+		("hash", "use adler32 hash to speed up comparing many files with same size and mostly identical content")
+		("dry-run,n", "don't change anything, implies --verbose")
+		("verbose,v", "show which files get linked")
+		("debug,d", "show files being examined")
+		("stdin,s", "read arguments from stdin, one per line")
+		("null,0", "implies --stdin, but use null bytes as delimiter")
+		;
+	po::options_description all_opts;
+	all_opts.add(opts);
+	all_opts.add_options()
+		("args", po::value< std::vector<std::string> >(), "files and directories to work on")
+		;
+	po::positional_options_description pos_opts;
+	pos_opts.add("args", -1);
+	try {
+		po::store(po::command_line_parser(argc, argv).options(all_opts).positional(pos_opts).run(), config);
+		po::notify(config);
+	} catch (po::error const & e) {
+		// e.g. an unknown option; without this the program would just abort
+		error << e.what() << std::endl;
+		error << "Invocation: hadori [ OPTIONS ] [ ARGUMENTS ]" << std::endl;
+		error << opts << std::endl;
+		return EX_USAGE;
+	}
+
+	if (config.count("help")) {
+		error << "Invocation: hadori [ OPTIONS ] [ ARGUMENTS ]" << std::endl;
+		error << opts << std::endl;
+		return EX_USAGE;
+	}
+
+	if (not config.count("debug"))
+		debug.rdbuf(nullptr);
+	if (not config.count("debug") and not config.count("verbose") and not config.count("dry-run"))
+		verbose.rdbuf(nullptr);
+
+	if (config.count("args")) {
+		if (config.count("stdin") or config.count("null")) {
+			// not supported because we don't know which arguments to scan first
+			error << "--stdin combined with commandline arguments, this is not supported." << std::endl;
+			return EX_USAGE;
+		}
+		for(std::string const & dir : config["args"].as< std::vector<std::string> >())
+			recurse_start(dir);
+	} else {
+		if (not config.count("stdin") and not config.count("null"))
+			error << "no arguments supplied, assuming --stdin." << std::endl;
+		char delim = '\n';
+		if (config.count("null"))
+			delim = '\0';
+		for (std::string dir; getline(std::cin, dir, delim);)
+			recurse_start(dir);
+	}
+
+	return EX_OK;
+}
diff --git a/inode.h b/inode.h
new file mode 100644
index 0000000..3040e71
--- /dev/null
+++ b/inode.h
@@ -0,0 +1,81 @@
+#ifndef HADORI_INODE_H
+#define HADORI_INODE_H
+
+#include <fstream>
+#include <ostream>
+#include <string>
+#include <string.h>
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <zlib.h>
+
+// A file as found by lstat(): one of its names, its attributes and, computed
+// on demand, the adler32 checksum of its content.
+class inode {
+public:
+	std::string const filename;
+	struct stat const stat;
+protected:
+	// uLong(-1) means "not computed yet"; adler32 itself never yields that value
+	uLong mutable adler;
+
+public:
+	inode (std::string const &, struct stat const &);
+
+	uLong get_adler () const;
+
+	friend bool compare (inode const &, inode const &);
+	friend std::ostream& operator<< (std::ostream&, inode const &);
+};
+
+inline inode::inode (std::string const & filename_, struct stat const & stat_) : filename(filename_), stat(stat_), adler(-1) {
+}
+
+// Checksum of the file content, computed on first use and cached afterwards.
+// A file that cannot be read yields the checksum of what could be read.
+inline uLong inode::get_adler () const {
+	if (adler == uLong(-1)) {
+		char buffer[1 << 14];
+		std::ifstream f(filename.c_str(), std::ios::in | std::ios::binary);
+
+		adler = adler32(0L, Z_NULL, 0);
+		// read() fails on the short last chunk and on errors, so go by gcount();
+		// looping until eof() would spin forever on a file that cannot be opened
+		while (f.read(buffer, sizeof(buffer)) or f.gcount() > 0)
+			adler = adler32(adler, reinterpret_cast<Bytef const *>(buffer), static_cast<uInt>(f.gcount()));
+	}
+	return adler;
+}
+
+// Tell whether both files have the same content. Files that cannot be opened
+// or read completely never compare equal, so they never get linked.
+inline bool compare (inode const & l, inode const & r) {
+	char lbuffer[1 << 14];
+	char rbuffer[1 << 14];
+	std::ifstream lf(l.filename.c_str(), std::ios::in | std::ios::binary);
+	std::ifstream rf(r.filename.c_str(), std::ios::in | std::ios::binary);
+
+	if (not lf or not rf)
+		return false;
+	for (;;) {
+		lf.read(lbuffer, sizeof(lbuffer));
+		rf.read(rbuffer, sizeof(rbuffer));
+		if (lf.gcount() != rf.gcount())
+			return false;
+		if (memcmp(lbuffer, rbuffer, lf.gcount()))
+			return false;
+		// a short read sets failbit: equal only if both streams simply hit the end
+		if (lf.fail() or rf.fail())
+			return lf.eof() and rf.eof() and not lf.bad() and not rf.bad();
+	}
+}
+
+inline std::ostream& operator<< (std::ostream& os, inode const & i) {
+	os << "Inode " << i.stat.st_ino << ", represented by " << i.filename;
+	return os;
+}
+
+#endif
-- 
cgit v1.2.3