From 148e9eab1109789916a7b55ece957156a57ee4cf Mon Sep 17 00:00:00 2001 From: Timo Weingärtner Date: Sun, 6 Nov 2011 00:00:31 +0100 Subject: initial commit --- hadori.C | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 hadori.C (limited to 'hadori.C') diff --git a/hadori.C b/hadori.C new file mode 100644 index 0000000..fcdebd4 --- /dev/null +++ b/hadori.C @@ -0,0 +1,205 @@ +#include +namespace po = boost::program_options; + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "inode.h" + +std::map kept; +std::map to_link; +std::multimap sizes; +po::variables_map config; +std::ostream debug(std::clog.rdbuf()), verbose(std::clog.rdbuf()), error(std::clog.rdbuf()); + +void do_link (inode const & i, std::string const & other) { + if (!link(i.filename.c_str(), other.c_str())) { + error << "linking " << i << " to " << other << " succeeded before unlinking (race condition)" << std::endl; + exit(EX_UNAVAILABLE); + } + if (errno != EEXIST) { + char * errstring = strerror(errno); + error << "error linking " << i << " to " << other << ": " << errstring << ", nothing bad happened." << std::endl; + exit(EX_UNAVAILABLE); + } + if (unlink(other.c_str())) { + char * errstring = strerror(errno); + error << "error unlinking " << other << " before linking " << i << " to it: " << errstring << std::endl; + exit(EX_UNAVAILABLE); + } + if (link(i.filename.c_str(), other.c_str())) { + char * errstring = strerror(errno); + error << "error linking " << i << " to " << other << ": " << errstring << ", destination filename was already unlinked." << std::endl; + exit(EX_UNAVAILABLE); + } +} + +void handle_file(std::string const & path, struct stat const & s) { + debug << "examining " << path << std::endl; + if (kept.count(s.st_ino)) { + debug << "another link to inode " << s.st_ino << " that we keep" << std::endl; + return; + } + if (to_link.count(s.st_ino)) { + inode const & target = kept.find(to_link[s.st_ino])->second; + debug << "another link to inode " << s.st_ino << " that we merge with " << target << std::endl; + do_link(target, path); + return; + } + inode f(path, s); + debug << f << " is new to us" << std::endl; + for (auto it = sizes.lower_bound(s.st_size); it != sizes.upper_bound(s.st_size); ++it) { + inode const & candidate = kept.find(it->second)->second; + debug << "looking if it matches " << candidate << std::endl; + if (candidate.stat.st_mode != s.st_mode) + continue; + if (candidate.stat.st_uid != s.st_uid) + continue; + if (candidate.stat.st_gid != s.st_gid) + continue; + if (not config.count("no-time")) + if (candidate.stat.st_mtime != s.st_mtime) + continue; + if (config.count("hash")) + if (candidate.get_adler() != f.get_adler()) + continue; + if (!compare(candidate, f)) + continue; + verbose << "linking " << candidate << " to " << path << std::endl; + to_link.insert(std::make_pair(s.st_ino, it->second)); + if (not config.count("dry-run")) + do_link(candidate, path); + return; + } + debug << "we keep " << f << std::endl; + kept.insert(std::make_pair(s.st_ino, f)); + sizes.insert(std::make_pair(s.st_size, s.st_ino)); +} + +void recurse (std::string const & dir, dev_t const dev) { + DIR* D; + struct dirent *d; + struct stat s; + std::queue subdirs; + + if (!(D = opendir(dir.c_str()))) { + char * errstring = strerror(errno); + error << "opendir(\"" << dir << "\"): " << errstring << std::endl; + return; + } + while ((d = readdir(D))) { + std::string path(dir); + path += '/'; + path += d->d_name; + if (lstat(path.c_str(), &s)) { + char * errstring = strerror(errno); + error << "lstat(\"" << path << "\"): " << errstring << std::endl; + continue; + } + if (s.st_dev != dev) { + error << path << " resides on another file system, ignoring." << std::endl; + continue; + } + if (S_ISDIR(s.st_mode)) + subdirs.push(d->d_name); + if (S_ISREG(s.st_mode)) + handle_file(path, s); + } + closedir(D); + // directories get handled after the parent dir is closed to prevent exhausting fds + for (; !subdirs.empty(); subdirs.pop()) { + if (subdirs.front() == "." || subdirs.front() == "..") + continue; + std::string subdir(dir); + subdir += '/'; + subdir += subdirs.front(); + recurse(subdir, dev); + } +} + +void recurse_start (std::string const & dir) { + struct stat s; + + if (lstat(dir.c_str(), &s)) { + char * errstring = strerror(errno); + error << "lstat(\"" << dir << "\"): " << errstring << std::endl; + exit(EX_NOINPUT); + } + + static dev_t const dev = s.st_dev; + if (dev != s.st_dev) { + error << dir << " resides on another file system, ignoring." << std::endl; + return; + } + + if (S_ISDIR(s.st_mode)) + recurse(dir, dev); + if (S_ISREG(s.st_mode)) + handle_file(dir, s); +} + +int main (int const argc, char ** argv) { + po::options_description opts("OPTIONS"); + opts.add_options() + ("help,h", "print this help message") + ("no-time,t", "ignore mtime") + ("hash", "use adler32 hash to speed up comparing many files with same size and mostly identical content") + ("dry-run,n", "don't change anything, implies --verbose") + ("verbose,v", "show which files get linked") + ("debug,d", "show files being examined") + ("stdin,s", "read arguments from stdin, one per line") + ("null,0", "implies --stdin, but use null bytes as delimiter") + ; + po::options_description all_opts; + all_opts.add(opts); + all_opts.add_options() + ("args", po::value< std::vector >(), "files and directories to work on") + ; + po::positional_options_description pos_opts; + pos_opts.add("args", -1); + po::store(po::command_line_parser(argc, argv).options(all_opts).positional(pos_opts).run(), config); + po::notify(config); + + if (config.count("help")) { + error << "Invocation: hadori [ OPTIONS ] [ ARGUMENTS ]" << std::endl; + error << opts << std::endl; + return EX_USAGE; + } + + if (not config.count("debug")) + debug.rdbuf(nullptr); + if (not config.count("debug") and not config.count("verbose") and not config.count("dry-run")) + verbose.rdbuf(nullptr); + + if (config.count("args")) { + if (config.count("stdin") or config.count("null")) { + // not supported because we don't know which arguments to scan first + error << "--stdin combined with commandline arguments, this is not supported." << std::endl; + return EX_USAGE; + } + for(std::string const & dir : config["args"].as< std::vector >()) + recurse_start(dir); + } else { + if (not config.count("stdin") and not config.count("null")) + error << "no arguments supplied, assuming --stdin." << std::endl; + char delim = '\n'; + if (config.count("null")) + delim = '\0'; + for (std::string dir; getline(std::cin, dir, delim);) + recurse_start(dir); + } + + return EX_OK; +} -- cgit v1.2.3