diff options
Diffstat (limited to 'src/kDeDup')
-rw-r--r-- | src/kDeDup/Makefile.kmk | 35 | ||||
-rw-r--r-- | src/kDeDup/kDeDup.c | 1148 |
2 files changed, 1183 insertions, 0 deletions
diff --git a/src/kDeDup/Makefile.kmk b/src/kDeDup/Makefile.kmk new file mode 100644 index 0000000..5fe5213 --- /dev/null +++ b/src/kDeDup/Makefile.kmk @@ -0,0 +1,35 @@ +# $Id: Makefile.kmk 3012 2016-11-07 11:53:11Z bird $ +## @file +# Sub-makefile for kDeDup. +# + +# +# Copyright (c) 2016 knut st. osmundsen <bird-kBuild-spamx@anduin.net> +# +# This file is part of kBuild. +# +# kBuild is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# kBuild is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with kBuild. If not, see <http://www.gnu.org/licenses/> +# +# + +SUB_DEPTH = ../.. +include $(KBUILD_PATH)/subheader.kmk + +PROGRAMS += kDeDup +kDeDup_TEMPLATE = BIN +kDeDup_LIBS = $(LIB_KUTIL) +kDeDup_SOURCES = kDeDup.c + +include $(FILE_KBUILD_SUB_FOOTER) + diff --git a/src/kDeDup/kDeDup.c b/src/kDeDup/kDeDup.c new file mode 100644 index 0000000..58e48cc --- /dev/null +++ b/src/kDeDup/kDeDup.c @@ -0,0 +1,1148 @@ +/* $Id: kDeDup.c 3296 2019-01-22 21:29:08Z bird $ */ +/** @file + * kDeDup - Utility that finds duplicate files, optionally hardlinking them. + */ + +/* + * Copyright (c) 2016 knut st. osmundsen <bird-kBuild-spamx@anduin.net> + * + * This file is part of kBuild. + * + * kBuild is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * kBuild is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with kBuild; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/******************************************************************************* +* Header Files * +*******************************************************************************/ +#include <k/kTypes.h> +#include <errno.h> +#include <string.h> +#include <stdio.h> +#include <wchar.h> +#if K_OS != K_OS_WINDOWS +# include <stdlib.h> +# include <unistd.h> +# include <sys/fcntl.h> +# include <sys/stat.h> +#endif + +#include "md5.h" +//#include "sha2.h" + +#if K_OS == K_OS_WINDOWS +# include "nt/ntstuff.h" +# include "nt/ntstat.h" +# include "nt/fts-nt.h" +# include "nt/nthlp.h" +# include "nt/ntunlink.h" +#else +# include "fts.h" +#endif + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * The key is made up of two cryptographic hashes, collisions are + * highly unlikely (once SHA2 is implemented). + */ +typedef struct KDUPFILENODEKEY +{ + /** The MD5 digest of the file. */ + KU8 abMd5[16]; + /** The 256-bit SHA-2 digest of the file. */ + KU8 abSha2[32]; +} KDUPFILENODEKEY; +/** Pointer to a file node.*/ +typedef struct KDUPFILENODE *PKDUPFILENODE; +/** + * Hash tree node. + */ +typedef struct KDUPFILENODE +{ + /** The is made up of two hashes. */ + KDUPFILENODEKEY mKey; + /** Left branch. */ + PKDUPFILENODE mpLeft; + /** Right branch. */ + PKDUPFILENODE mpRight; + /** Tree height (hmm). */ + KU8 mHeight; + + /** The inode number. */ + KU64 uInode; + /** The device number. */ + KU64 uDev; + + /** Pointer to next hard linked node (same inode and udev values). */ + PKDUPFILENODE pNextHardLink; + /** Pointer to next duplicate node. */ + PKDUPFILENODE pNextDup; + /** Pointer to next duplicate node on the global list. */ + PKDUPFILENODE pNextGlobalDup; + + /** The path to this file (variable size). */ +#if K_OS == K_OS_WINDOWS + wchar_t wszPath[1]; +#else + char szPath[1]; +#endif +} KDUPFILENODE; + +#if K_OS == K_OS_WINDOWS +# define PATH_PRI "ls" +# define PATH_MEMB wszPath +# define FTS_ACCPATH fts_wcsaccpath +#else +# define PATH_PRI "s" +# define PATH_MEMB szPath +# define FTS_ACCPATH fts_accpath +#endif + +/*#define KAVL_EQUAL_ALLOWED*/ +#define KAVL_CHECK_FOR_EQUAL_INSERT +#define KAVL_MAX_STACK 32 +/*#define KAVL_RANGE */ +/*#define KAVL_OFFSET */ +/*#define KAVL_STD_KEY_COMP*/ +#define KAVLKEY KDUPFILENODEKEY +#define KAVLNODE KDUPFILENODE +#define KAVL_FN(name) kDupFileTree_ ## name +#define KAVL_TYPE(prefix,name) prefix ## KDUPFILENODE ## name +#define KAVL_INT(name) KDUPFILENODEINT ## name +#define KAVL_DECL(rettype) static rettype +#define KAVL_G(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) > 0 ) +#define KAVL_E(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) == 0 ) +#define KAVL_NE(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) != 0 ) + +#define register +#include <k/kAvlTmpl/kAvlBase.h> +//#include <k/kAvlTmpl/kAvlDoWithAll.h> +//#include <k/kAvlTmpl/kAvlEnum.h> - busted +#include <k/kAvlTmpl/kAvlGet.h> +//#include <k/kAvlTmpl/kAvlGetBestFit.h> +//#include <k/kAvlTmpl/kAvlGetWithParent.h> +//#include <k/kAvlTmpl/kAvlRemove2.h> +//#include <k/kAvlTmpl/kAvlRemoveBestFit.h> +#include <k/kAvlTmpl/kAvlUndef.h> +#undef register + + +/** Pointer to a size tree node. */ +typedef struct KDUPSIZENODE *PKDUPSIZENODE; +/** + * Size tree node. + */ +typedef struct KDUPSIZENODE +{ + /** The file size. */ + KU64 mKey; + /** Left branch. */ + PKDUPSIZENODE mpLeft; + /** Right branch. */ + PKDUPSIZENODE mpRight; + /** Tree height (hmm). */ + KU8 mHeight; + /** Number of files. */ + KU32 cFiles; + /** Tree with same sized files. + * When cFiles is 1 the root node does not have hashes calculated yet. */ + KDUPFILENODEROOT FileRoot; +} KDUPSIZENODE; + +/*#define KAVL_EQUAL_ALLOWED*/ +#define KAVL_CHECK_FOR_EQUAL_INSERT +#define KAVL_MAX_STACK 32 +/*#define KAVL_RANGE */ +/*#define KAVL_OFFSET */ +#define KAVL_STD_KEY_COMP +#define KAVLKEY KU64 +#define KAVLNODE KDUPSIZENODE +#define KAVL_FN(name) kDupSizeTree_ ## name +#define KAVL_TYPE(prefix,name) prefix ## KDUPSIZENODE ## name +#define KAVL_INT(name) KDUPSIZENODEINT ## name +#define KAVL_DECL(rettype) static rettype + +#include <k/kAvlTmpl/kAvlBase.h> +//#include <k/kAvlTmpl/kAvlDoWithAll.h> +//#include <k/kAvlTmpl/kAvlEnum.h> - busted +#include <k/kAvlTmpl/kAvlGet.h> +//#include <k/kAvlTmpl/kAvlGetBestFit.h> +//#include <k/kAvlTmpl/kAvlGetWithParent.h> +//#include <k/kAvlTmpl/kAvlRemove2.h> +//#include <k/kAvlTmpl/kAvlRemoveBestFit.h> +#include <k/kAvlTmpl/kAvlUndef.h> + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +/** The verbosity level. */ +static unsigned g_cVerbosity = 0; + +/** Whether to recurse into subdirectories. */ +static KBOOL g_fRecursive = K_FALSE; +/** Whether to recurse into symlinked subdirectories. */ +static KBOOL g_fRecursiveViaSymlinks = K_FALSE; +/** Whether to follow symbolicly linked files. */ +static KBOOL g_fFollowSymlinkedFiles = K_TRUE; + +/** Minimum file size to care about. */ +static KU64 g_cbMinFileSize = 1; +/** Maximum file size to care about. */ +static KU64 g_cbMaxFileSize = KU64_MAX; + +/** The root of the size tree. */ +static KDUPSIZENODEROOT g_SizeRoot; + +/** Global list of duplicate file with duplicates. + * @remarks This only contains the files in the hash tree, not the ones on + * the KDUPFILENODE::pNextDup list. */ +static PKDUPFILENODE g_pDuplicateHead = NULL; +/** Where to insert the next file with duplicates. */ +static PKDUPFILENODE *g_ppNextDuplicate = &g_pDuplicateHead; + +/** Number of files we're tracking. */ +static KU64 g_cFiles = 0; +/** Number of hardlinked files or files entered more than once. */ +static KU64 g_cHardlinked = 0; +/** Number of duplicates files (not hardlinked). */ +static KU64 g_cDuplicates = 0; +/** Number of duplicates files that can be hardlinked. */ +static KU64 g_cDuplicatesSaved = 0; +/** Size that could be saved if the duplicates were hardlinked. */ +static KU64 g_cbDuplicatesSaved = 0; + + + +/** + * Wrapper around malloc() that complains when out of memory. + * + * @returns Pointer to allocated memory + * @param cb The size of the memory to allocate. + */ +static void *kDupAlloc(KSIZE cb) +{ + void *pvRet = malloc(cb); + if (pvRet) + return pvRet; + fprintf(stderr, "kDeDup: error: out of memory! (cb=%#zx)\n", cb); + return NULL; +} + +/** Wrapper around free() for symmetry. */ +#define kDupFree(ptr) free(ptr) + +#if K_OS != K_OS_WINDOWS +/** Wrapper around read() that hides EINTR and such. */ +static ssize_t kDupReadFile(int fd, void *pvBuf, size_t cbToRead) +{ + ssize_t cbRet; + do + cbRet = read(fd, pvBuf, cbToRead); + while (cbRet < 0 && errno == EINTR); + if (cbRet > 0 && (size_t)cbRet != cbToRead) + { + for (;;) + { + size_t cbLeft = cbToRead - (size_t)cbRet; + ssize_t cbPart; + do + cbPart = read(fd, (KU8 *)pvBuf + (size_t)cbRet, cbLeft); + while (cbPart < 0 && errno == EINTR); + if (cbPart <= 0) + break; + cbRet += cbPart; + } + } + return cbRet; +} +#endif + + +static void kDupHashFile(PKDUPFILENODE pFileNode, FTSENT *pFtsEnt) +{ + KSIZE i; + PKDUPFILENODE *ppHash; + + /* + * Open the file. + */ +#if K_OS == K_OS_WINDOWS + HANDLE hFile; + if (pFtsEnt && pFtsEnt->fts_parent && pFtsEnt->fts_parent->fts_dirfd != INVALID_HANDLE_VALUE) + hFile = birdOpenFileExW(pFtsEnt->fts_parent->fts_dirfd, pFtsEnt->fts_wcsname, + FILE_READ_DATA | SYNCHRONIZE, + FILE_ATTRIBUTE_NORMAL, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + FILE_OPEN, + FILE_NON_DIRECTORY_FILE | FILE_OPEN_FOR_BACKUP_INTENT | FILE_SYNCHRONOUS_IO_NONALERT, + OBJ_CASE_INSENSITIVE); + else + hFile = birdOpenFileExW(NULL, pFileNode->wszPath, + FILE_READ_DATA | SYNCHRONIZE, + FILE_ATTRIBUTE_NORMAL, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + FILE_OPEN, + FILE_NON_DIRECTORY_FILE | FILE_OPEN_FOR_BACKUP_INTENT | FILE_SYNCHRONOUS_IO_NONALERT, + OBJ_CASE_INSENSITIVE); + if (hFile != INVALID_HANDLE_VALUE) +#else /* K_OS != K_OS_WINDOWS */ +# ifdef O_BINARY + int fd = open(pFileNode->szPath, O_RDONLY | O_BINARY); +# else + int fd = open(pFileNode->szPath, O_RDONLY); +# endif + if (fd >= 0) +#endif /* K_OS != K_OS_WINDOWS */ + { + /* + * Init the hash calculation contexts. + */ + struct MD5Context Md5Ctx; + //SHA256CONTEXT Sha256Ctx; + MD5Init(&Md5Ctx); + //Sha256Init(&Sha256Ctx); + + /* + * Process the file chunk by chunk. + * + * We could complicate this by memory mapping medium sized files, but + * those kind of complications can wait. + */ + for (;;) + { + static KU8 s_abBuffer[2*1024*1024]; +#if K_OS == K_OS_WINDOWS + MY_NTSTATUS rcNt; + MY_IO_STATUS_BLOCK Ios; + Ios.Information = -1; + Ios.u.Status = -1; + rcNt = g_pfnNtReadFile(hFile, NULL /*hEvent*/, NULL /*pfnApc*/, NULL /*pvApcCtx*/, + &Ios, s_abBuffer, sizeof(s_abBuffer), NULL /*poffFile*/, NULL /*puKey*/); + if (MY_NT_SUCCESS(rcNt)) + { + MD5Update(&Md5Ctx, s_abBuffer, (unsigned)Ios.Information); + //SHA256Update(&Sha256Ctx, s_abBuffer, Ios.Information); + } + else if (rcNt != STATUS_END_OF_FILE) + { + fprintf(stderr, "kDeDup: warning: Error reading '%ls': %#x\n", pFileNode->wszPath, rcNt); + break; + } + + /* Check for end of file. */ + if ( rcNt == STATUS_END_OF_FILE + || Ios.Information < sizeof(s_abBuffer)) + { + MD5Final(pFileNode->mKey.abMd5, &Md5Ctx); + //Sha256Final(pFileNode->mKey.abSha2, &Sha256Ctx); + + birdCloseFile(hFile); + return; + } +#else /* K_OS != K_OS_WINDOWS */ + ssize_t cbRead = kDupReadFile(fd, s_abBuffer, sizeof(s_abBuffer)); + if (cbRead > 0) + { + MD5Update(&Md5Ctx, s_abBuffer, (unsigned)cbRead); + //SHA256Update(&Sha256Ctx, s_abBuffer, (unsigned)cbRead); + } + else if (cbRead == 0) + { + MD5Final(pFileNode->mKey.abMd5, &Md5Ctx); + //Sha256Final(pFileNode->mKey.abSha2, &Sha256Ctx); + close(fd); + return; + } + else + { + fprintf(stderr, "kDeDup: warning: Error reading '%s': %s (%d)\n", pFileNode->szPath, strerror(errno), errno); + break; + } +#endif /* K_OS != K_OS_WINDOWS */ + } + +#if K_OS == K_OS_WINDOWS + birdCloseFile(hFile); +#else + close(fd); +#endif + } + else + fprintf(stderr, "kDeDup: warning: Failed to open '%" PATH_PRI "': %s (%d)\n", + pFileNode->PATH_MEMB, strerror(errno), errno); + + /* + * Hashing failed. We fake the digests by repeating the node pointer value + * again and again, holding a collision with both SHA2 and MD5 with similar + * digest pattern for highly unlikely. + */ + ppHash = (PKDUPFILENODE *)&pFileNode->mKey; + i = sizeof(pFileNode->mKey) / sizeof(*ppHash); + while (i-- > 0) + *ppHash++ = pFileNode; +} + + +/** + * Deal with one file, adding it to the tree if it matches the criteria. + * + * @returns 0 on success, non-zero on failure. + * @param pFtsEnt The FTS entry for the file. + */ +static int kDupDoFile(FTSENT *pFtsEnt) +{ + KU64 cbFile; +#if K_OS == K_OS_WINDOWS + struct stat const *pStat = &pFtsEnt->fts_stat; +#else + struct stat const *pStat = pFtsEnt->fts_statp; +#endif + + if (g_cVerbosity >= 2) + printf("debug: kDupDoFile(%" PATH_PRI ")\n", pFtsEnt->FTS_ACCPATH); + + /* + * Check that it's within the size range. + */ + cbFile = pStat->st_size; + if ( cbFile >= g_cbMinFileSize + && cbFile <= g_cbMaxFileSize) + { + /* + * Start out treating this like a unique file with a unique size, i.e. + * allocate all the structures we might possibly need. + */ +#if K_OS == K_OS_WINDOWS + size_t cbAccessPath = (wcslen(pFtsEnt->fts_wcsaccpath) + 1) * sizeof(wchar_t); +#else + size_t cbAccessPath = strlen(pFtsEnt->fts_accpath) + 1; +#endif + PKDUPFILENODE pFileNode = (PKDUPFILENODE)kDupAlloc(sizeof(*pFileNode) + cbAccessPath); + PKDUPSIZENODE pSizeNode = (PKDUPSIZENODE)kDupAlloc(sizeof(*pSizeNode)); + if (!pFileNode || !pSizeNode) + return 3; + g_cFiles++; + + memset(&pFileNode->mKey, 0, sizeof(pFileNode->mKey)); + pFileNode->pNextHardLink = NULL; + pFileNode->pNextDup = NULL; + pFileNode->pNextGlobalDup = NULL; + pFileNode->uDev = pStat->st_dev; + pFileNode->uInode = pStat->st_ino; + memcpy(pFileNode->PATH_MEMB, pFtsEnt->FTS_ACCPATH, cbAccessPath); + + pSizeNode->mKey = cbFile; + pSizeNode->cFiles = 1; + kDupFileTree_Init(&pSizeNode->FileRoot); + kDupFileTree_Insert(&pSizeNode->FileRoot, pFileNode); + + /* + * Try insert it. + */ + if (kDupSizeTree_Insert(&g_SizeRoot, pSizeNode)) + { /* unique size, nothing more to do for now. */ } + else + { + /* + * More than one file with this size. We may need to hash the + * hash the file we encountered with this size, if this is the + * second one. In that case we should check for hardlinked or + * double entering of the file first as well. + */ + kDupFree(pSizeNode); + pSizeNode = kDupSizeTree_Get(&g_SizeRoot, cbFile); + if (pSizeNode->cFiles == 1) + { + PKDUPFILENODE pFirstFileNode = pSizeNode->FileRoot.mpRoot; + if ( pFirstFileNode->uInode == pFileNode->uInode + && pFileNode->uInode != 0 + && pFirstFileNode->uDev == pFileNode->uDev) + { + pFileNode->pNextHardLink = pFirstFileNode->pNextHardLink; + pFirstFileNode->pNextHardLink = pFileNode; + if (g_cVerbosity >= 1) + printf("Found hardlinked: '%" PATH_PRI "' -> '%" PATH_PRI "' (ino:%#" KX64_PRI " dev:%#" KX64_PRI ")\n", + pFileNode->PATH_MEMB, pFirstFileNode->PATH_MEMB, pFileNode->uInode, pFileNode->uDev); + g_cHardlinked += 1; + return 0; + } + + kDupHashFile(pFirstFileNode, NULL); + } + kDupHashFile(pFileNode, pFtsEnt); + + if (kDupFileTree_Insert(&pSizeNode->FileRoot, pFileNode)) + { /* great, unique content */ } + else + { + /* + * Duplicate content. Could be hardlinked or a duplicate entry. + */ + PKDUPFILENODE pDupFileNode = kDupFileTree_Get(&pSizeNode->FileRoot, pFileNode->mKey); + if ( pDupFileNode->uInode == pFileNode->uInode + && pFileNode->uInode != 0 + && pDupFileNode->uDev == pFileNode->uDev) + { + pFileNode->pNextHardLink = pDupFileNode->pNextHardLink; + pDupFileNode->pNextHardLink = pFileNode; + if (g_cVerbosity >= 1) + printf("Found hardlinked: '%" PATH_PRI "' -> '%" PATH_PRI "' (ino:%#" KX64_PRI " dev:%#" KX64_PRI ")\n", + pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB, pFileNode->uInode, pFileNode->uDev); + g_cHardlinked += 1; + } + else + { + KBOOL fDifferentDev; + + /* Genuinly duplicate (or inode numbers are busted). */ + if (!pDupFileNode->pNextDup) + { + *g_ppNextDuplicate = pDupFileNode; + g_ppNextDuplicate = &pDupFileNode->pNextGlobalDup; + } + + /* The list is sorted by device to better facility hardlinking later. */ + while ( (fDifferentDev = pDupFileNode->uDev != pFileNode->uDev) + && pDupFileNode->pNextDup) + pDupFileNode = pDupFileNode->pNextDup; + + pFileNode->pNextDup = pDupFileNode->pNextDup; + pDupFileNode->pNextDup = pFileNode; + + g_cDuplicates += 1; + if (!fDifferentDev) + { + g_cDuplicatesSaved += 1; +#if K_OS == K_OS_WINDOWS + g_cbDuplicatesSaved += pStat->st_blocks * BIRD_STAT_BLOCK_SIZE; +#else + g_cbDuplicatesSaved += pStat->st_size; +#endif + if (g_cVerbosity >= 1) + printf("Found duplicate: '%" PATH_PRI "' <-> '%" PATH_PRI "'\n", + pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB); + } + else if (g_cVerbosity >= 1) + printf("Found duplicate: '%" PATH_PRI "' <-> '%" PATH_PRI "' (devices differ).\n", + pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB); + } + } + } + } + else if (g_cVerbosity >= 1) + printf("Skipping '%" PATH_PRI "' because %" KU64_PRI " bytes is outside the size range.\n", pFtsEnt->FTS_ACCPATH, cbFile); + return 0; +} + + +/** + * Process the non-option arguments, creating the file tree. + * + * @returns 0 on success, non-zero on failure. + * @param papwszFtsArgs The input in argv style. + * @param fFtsOptions The FTS options. + */ +#if K_OS == K_OS_WINDOWS +static int kDupReadAll(wchar_t **papwszFtsArgs, unsigned fFtsOptions) +#else +static int kDupReadAll(char **papszFtsArgs, unsigned fFtsOptions) +#endif +{ + int rcExit = 0; +#if K_OS == K_OS_WINDOWS + FTS *pFts = nt_fts_openw(papwszFtsArgs, fFtsOptions, NULL /*pfnCompare*/); +#else + FTS *pFts = fts_open(papszFtsArgs, fFtsOptions, NULL /*pfnCompare*/); +#endif + if (pFts != NULL) + { + for (;;) + { +#if K_OS == K_OS_WINDOWS + FTSENT *pFtsEnt = nt_fts_read(pFts); +#else + FTSENT *pFtsEnt = fts_read(pFts); +#endif + if (pFtsEnt) + { + switch (pFtsEnt->fts_info) + { + case FTS_F: + rcExit = kDupDoFile(pFtsEnt); + if (rcExit == 0) + continue; + break; + + case FTS_D: + if ( g_fRecursive + || pFtsEnt->fts_level == FTS_ROOTLEVEL) /* enumerate dirs on the command line */ + continue; +#if K_OS == K_OS_WINDOWS + rcExit = nt_fts_set(pFts, pFtsEnt, FTS_SKIP); +#else + rcExit = fts_set(pFts, pFtsEnt, FTS_SKIP); +#endif + if (rcExit == 0) + continue; + fprintf(stderr, "kDeDup: internal error: nt_fts_set failed!\n"); + rcExit = 1; + break; + + case FTS_DP: + /* nothing to do here. */ + break; + + case FTS_SL: + { +#if K_OS == K_OS_WINDOWS + /* The nice thing on windows is that we already know whether it's a + directory or file when encountering the symbolic link. */ + if ( (pFtsEnt->fts_stat.st_isdirsymlink ? g_fRecursiveViaSymlinks : g_fFollowSymlinkedFiles) + && pFtsEnt->fts_number == 0) +#else + struct stat St; + if ( pFtsEnt->fts_number == 0 + && ( (g_fRecursiveViaSymlinks && g_fFollowSymlinkedFiles) + || ( stat(pFtsEnt->fts_accpath, &St) == 0 + && (S_ISDIR(St.st_mode) ? g_fRecursiveViaSymlinks : g_fFollowSymlinkedFiles)))) +#endif + { + pFtsEnt->fts_number++; +#if K_OS == K_OS_WINDOWS + rcExit = nt_fts_set(pFts, pFtsEnt, FTS_FOLLOW); +#else + rcExit = fts_set(pFts, pFtsEnt, FTS_FOLLOW); +#endif + if (rcExit == 0) + continue; + fprintf(stderr, "kDeDup: internal error: nt_fts_set failed!\n"); + rcExit = 1; + } + break; + } + + case FTS_DC: + fprintf(stderr, "kDeDup: warning: Ignoring cycle '%" PATH_PRI "'!\n", pFtsEnt->FTS_ACCPATH); + continue; + + case FTS_NS: + fprintf(stderr, "kDeDup: warning: Failed to stat '%" PATH_PRI "': %s (%d)\n", + pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno); + continue; + + case FTS_DNR: + fprintf(stderr, "kDeDup: error: Error reading directory '%" PATH_PRI "': %s (%d)\n", + pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno); + rcExit = 1; + break; + + case FTS_ERR: + fprintf(stderr, "kDeDup: error: Error on '%" PATH_PRI "': %s (%d)\n", + pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno); + rcExit = 1; + break; + + + /* ignore */ + case FTS_SLNONE: + case FTS_DEFAULT: + break; + + /* Not supposed to get here. */ + default: + fprintf(stderr, "kDeDup: internal error: fts_info=%d - '%" PATH_PRI "'\n", + pFtsEnt->fts_info, pFtsEnt->FTS_ACCPATH); + rcExit = 1; + break; + } + } + else if (errno == 0) + break; + else + { + fprintf(stderr, "kDeDup: error: nt_fts_read failed: %s (%d)\n", strerror(errno), errno); + rcExit = 1; + break; + } + } + +#if K_OS == K_OS_WINDOWS + if (nt_fts_close(pFts) != 0) +#else + if (fts_close(pFts) != 0) +#endif + { + fprintf(stderr, "kDeDup: error: nt_fts_close failed: %s (%d)\n", strerror(errno), errno); + rcExit = 1; + } + } + else + { + fprintf(stderr, "kDeDup: error: nt_fts_openw failed: %s (%d)\n", strerror(errno), errno); + rcExit = 1; + } + + return rcExit; +} + + +/** + * Compares the content of the two files. + * + * @returns 0 if equal, 1 if not equal, -1 on open/read error. + * @param pFile1 The first file. + * @param pFile2 The second file. + */ +static int kDupCompareFiles(PKDUPFILENODE pFile1, PKDUPFILENODE pFile2) +{ +#if K_OS == K_OS_WINDOWS + int rcRet = 0; + K_NOREF(pFile1); + K_NOREF(pFile2); + /** @todo compare files. */ +#else + int rcRet = -1; +# ifdef O_BINARY + int fOpen = O_RDONLY | O_BINARY; +# else + int fOpen = O_RDONLY; +# endif + /* + * Open the two files. + */ + int fd1 = open(pFile1->szPath, fOpen); + if (fd1 >= 0) + { + int fd2 = open(pFile2->szPath, fOpen); + if (fd1 >= 0) + { + /* + * Read and compare all the data. + */ + static KU8 s_abBuf1[2*1024*1024]; + static KU8 s_abBuf2[2*1024*1024]; + KU64 off = 0; + for (;;) + { + ssize_t cb1 = kDupReadFile(fd1, s_abBuf1, sizeof(s_abBuf1)); + ssize_t cb2 = kDupReadFile(fd2, s_abBuf2, sizeof(s_abBuf2)); + if (cb1 < 0 || cb2 < 0) + { + if (cb1 < 0) + fprintf(stderr, "kDeDup: error: reading from '%s': %s (%d)\n", pFile1->szPath, strerror(errno), errno); + if (cb2 < 0) + fprintf(stderr, "kDeDup: error: reading from '%s': %s (%d)\n", pFile2->szPath, strerror(errno), errno); + break; + } + if (cb1 != cb2) + { + fprintf(stderr, "kDeDup: warning: '%s' now differs from '%s' in size...\n", pFile1->szPath, pFile2->szPath); + rcRet = 1; + break; + } + if (cb1 == 0) + { + rcRet = 0; + break; + } + if (memcmp(s_abBuf1, s_abBuf2, cb1) != 0) + { + fprintf(stderr, "kDeDup: warning: hash collision: '%s' differs from '%s' (" KX64_PRI " LB %#x)\n", + pFile1->szPath, pFile2->szPath, off, (unsigned)cb1); + rcRet = 1; + break; + } + off += cb1; + } + + close(fd2); + } + close(fd1); + } +#endif + return rcRet; +} + + +/** + * Hardlink duplicates. + */ +static int kDupHardlinkDuplicates(void) +{ + int rcExit = 0; + PKDUPFILENODE pFileNode; + for (pFileNode = g_pDuplicateHead; pFileNode != NULL; pFileNode = pFileNode->pNextGlobalDup) + { + PKDUPFILENODE pTargetFile = pFileNode; + PKDUPFILENODE pDupFile; + for (pDupFile = pFileNode->pNextDup; pDupFile != NULL; pDupFile = pDupFile->pNextDup) + { + /* + * Can only hard link if the files are on the same device. + */ + if (pDupFile->uDev == pTargetFile->uDev) + { + if (kDupCompareFiles(pDupFile, pTargetFile) == 0) + { + /* + * Start by renaming the orinal file before we try create the hard link. + */ +#if K_OS == K_OS_WINDOWS + static const wchar_t s_wszBackupSuffix[] = L".kDepBackup"; + wchar_t wszBackup[0x4000]; + size_t cwcPath = wcslen(pDupFile->wszPath); + if (cwcPath + sizeof(s_wszBackupSuffix) / sizeof(wchar_t) < K_ELEMENTS(wszBackup)) + { + memcpy(wszBackup, pDupFile->wszPath, cwcPath * sizeof(wchar_t)); + memcpy(&wszBackup[cwcPath], s_wszBackupSuffix, sizeof(s_wszBackupSuffix)); + if (MoveFileW(pDupFile->wszPath, wszBackup)) + { + if (CreateHardLinkW(pDupFile->wszPath, pTargetFile->wszPath, NULL)) + { + if (birdUnlinkForcedW(wszBackup) == 0) + { + if (g_cVerbosity >= 1) + printf("Hardlinked '%ls' to '%ls'.\n", pDupFile->wszPath, pTargetFile->wszPath); + } + else + { + fprintf(stderr, "kDeDup: fatal: failed to delete '%ls' after hardlinking: %s (%d)\n", + wszBackup, strerror(errno), errno); + return 8; + } + } + else + { + fprintf(stderr, "kDeDup: error: failed to hard link '%ls' to '%ls': %u\n", + pDupFile->wszPath, wszBackup, GetLastError()); + if (!MoveFileW(wszBackup, pDupFile->wszPath)) + { + fprintf(stderr, "kDeDup: fatal: Restore '%ls' to '%ls' after hardlinking failed: %u\n", + wszBackup, pDupFile->wszPath, GetLastError()); + return 8; + } + rcExit = 1; + } + } + else + { + fprintf(stderr, "kDeDup: error: failed to rename '%ls' to '%ls': %u\n", + pDupFile->wszPath, wszBackup, GetLastError()); + rcExit = 1; + } + } + else + { + fprintf(stderr, "kDeDup: error: too long backup path: '%ls'\n", pDupFile->wszPath); + rcExit = 1; + } +#else /* K_OS != K_OS_WINDOWS */ + static const char s_szBackupSuffix[] = ".kDepBackup"; + char szBackup[0x4000]; + size_t cchPath = strlen(pDupFile->szPath); + if (cchPath + sizeof(s_szBackupSuffix) < sizeof(szBackup)) + { + struct stat StTmp; + memcpy(szBackup, pDupFile->szPath, cchPath); + memcpy(&szBackup[cchPath], s_szBackupSuffix, sizeof(s_szBackupSuffix)); + if (stat(szBackup, &StTmp) != 0) + { + if (rename(pDupFile->szPath, szBackup) == 0) + { + if (link(pTargetFile->szPath, pDupFile->szPath) == 0) + { + if (unlink(szBackup) == 0) + { + if (g_cVerbosity >= 1) + printf("Hardlinked '%s' to '%s'.\n", pDupFile->szPath, pTargetFile->szPath); + } + else + { + fprintf(stderr, "kDeDup: fatal: failed to delete '%s' after hardlinking: %s (%d)\n", + szBackup, strerror(errno), errno); + return 8; + } + } + else + { + fprintf(stderr, "kDeDup: error: failed to hard link '%s' to '%s': %s (%d)\n", + pDupFile->szPath, szBackup, strerror(errno), errno); + if (rename(szBackup, pDupFile->szPath) != 0) + { + fprintf(stderr, "kDeDup: fatal: Restore '%s' to '%s' after hardlinking failed: %s (%d)\n", + szBackup, pDupFile->szPath, strerror(errno), errno); + return 8; + } + rcExit = 1; + } + } + else + { + fprintf(stderr, "kDeDup: error: failed to rename '%s' to '%s': %s (%d)\n", + pDupFile->szPath, szBackup, strerror(errno), errno); + rcExit = 1; + } + } + else + { + fprintf(stderr, "kDeDup: error: failed to rename '%s' to '%s': file already exist (st_mode=%#x)\n", + pDupFile->szPath, szBackup, StTmp.st_mode); + rcExit = 1; + } + } + else + { + fprintf(stderr, "kDeDup: error: too long backup path: '%s'\n", pDupFile->szPath); + rcExit = 1; + } +#endif /* K_OS != K_OS_WINDOWS */ + } + } + /* + * Since the list is sorted by uDev, we now change the target file. + */ + else + pTargetFile = pDupFile; + } + } + return rcExit; +} + + +static int usage(const char *pszName, FILE *pOut) +{ + fprintf(pOut, + "usage: %s [options] <path1> [path2 [..]]\n" + "usage: %s <-V|--version>\n" + "usage: %s <-h|--help>\n" + , pszName, pszName, pszName); + fprintf(pOut, + "\n" + "Options:\n" + " -H, --dereference-command-line, --no-dereference-command-line\n" + " Follow symbolic links on the command line.\n" + " -L, --dereference\n" + " Follow symbolic links while scanning directories.\n" + " -P, --no-dereference\n" + " Do not follow symbolic links while scanning directories.\n" + " -r, --recursive\n" + " Recurse into subdirectories, but do not follow links to them.\n" + " -R, --recursive-dereference\n" + " Same as -r, but also follow into symlinked subdirectories.\n" + " -x, --one-file-system\n" + " Do not consider other file system (volumes), either down thru a\n" + " mount point or via a symbolic link to a directory.\n" + " --no-one-file-system, --cross-file-systems\n" + " Reverses the effect of --one-file-system.\n" + " -q, --quiet, -v,--verbose\n" + " Controls the output level.\n" + " --hardlink-duplicates\n" + " Hardlink duplicate files to remove duplicates and save space. By default\n" + " no action is taken and only analysis is done.\n" + ); + return 0; +} + + +#if K_OS == K_OS_WINDOWS +int wmain(int argc, wchar_t **argv) +#else +int main(int argc, char **argv) +#endif +{ + int rcExit; + + /* + * Process parameters. Position. + */ + unsigned cFtsArgs = 0; +#if K_OS == K_OS_WINDOWS + wchar_t **papwszFtsArgs = (wchar_t **)calloc(argc + 1, sizeof(wchar_t *)); + unsigned fFtsOptions = FTS_NOCHDIR | FTS_NO_ANSI; +#else + char **papszFtsArgs = (char **)calloc(argc + 1, sizeof(char *)); + unsigned fFtsOptions = FTS_NOCHDIR; +#endif + KBOOL fEndOfOptions = K_FALSE; + KBOOL fHardlinkDups = K_FALSE; + int i; + for (i = 1; i < argc; i++) + { +#if K_OS == K_OS_WINDOWS + wchar_t *pwszArg = argv[i]; + if ( *pwszArg == '-' + && !fEndOfOptions) +#else + char *pszArg = argv[i]; + if ( *pszArg == '-' + && !fEndOfOptions) +#endif + { +#if K_OS != K_OS_WINDOWS + wchar_t wszOpt[1024] = { 0 }; + wchar_t *pwszArg = wszOpt; + mbsrtowcs(wszOpt, (const char **)&pszArg, 1024 - 1, NULL); +#endif + wchar_t wcOpt = *++pwszArg; + pwszArg++; + if (wcOpt == '-') + { + /* Translate long options. */ + if (wcscmp(pwszArg, L"help") == 0) + wcOpt = 'h'; + else if (wcscmp(pwszArg, L"version") == 0) + wcOpt = 'V'; + else if (wcscmp(pwszArg, L"recursive") == 0) + wcOpt = 'r'; + else if (wcscmp(pwszArg, L"dereference-recursive") == 0) + wcOpt = 'R'; + else if (wcscmp(pwszArg, L"dereference") == 0) + wcOpt = 'L'; + else if (wcscmp(pwszArg, L"dereference-command-line") == 0) + wcOpt = 'H'; + else if (wcscmp(pwszArg, L"one-file-system") == 0) + wcOpt = 'x'; + /* Process long options. */ + else if (*pwszArg == '\0') + { + fEndOfOptions = K_TRUE; + continue; + } + else if (wcscmp(pwszArg, L"no-recursive") == 0) + { + g_fRecursive = g_fRecursiveViaSymlinks = K_FALSE; + continue; + } + else if (wcscmp(pwszArg, L"no-dereference-command-line") == 0) + { + fFtsOptions &= ~FTS_COMFOLLOW; + continue; + } + else if ( wcscmp(pwszArg, L"no-one-file-system") == 0 + || wcscmp(pwszArg, L"cross-file-systems") == 0) + { + fFtsOptions &= ~FTS_XDEV; + continue; + } + else if (wcscmp(pwszArg, L"hardlink-duplicates") == 0) + { + fHardlinkDups = K_TRUE; + continue; + } + else + { + fprintf(stderr, "kDeDup: syntax error: Unknown option '--%ls'\n", pwszArg); + return 2; + } + } + + /* Process one or more short options. */ + do + { + switch (wcOpt) + { + case 'r': /* --recursive */ + g_fRecursive = K_TRUE; + break; + + case 'R': /* --dereference-recursive */ + g_fRecursive = g_fRecursiveViaSymlinks = K_TRUE; + break; + + case 'H': /* --dereference-command-line */ + fFtsOptions |= FTS_COMFOLLOW; + break; + + case 'L': /* --dereference*/ + g_fFollowSymlinkedFiles = K_TRUE; + break; + + case 'x': /* --one-file-system*/ + fFtsOptions |= FTS_XDEV; + break; + + case 'q': + g_cVerbosity = 0; + break; + + case 'v': + g_cVerbosity++; + break; + + + case 'h': + case '?': + return usage("kDeDup", stdout); + + case 'V': + printf("0.0.1\n"); + return 0; + + default: +#if K_OS == K_OS_WINDOWS + fprintf(stderr, "kDeDup: syntax error: Unknown option '-%lc'\n", wcOpt); +#else + fprintf(stderr, "kDeDup: syntax error: Unknown option '-%c'\n", (int)wcOpt); +#endif + return 2; + } + + wcOpt = *pwszArg++; + } while (wcOpt != '\0'); + } + else + { + /* + * Append non-option arguments to the FTS argument vector. + */ +#if K_OS == K_OS_WINDOWS + papwszFtsArgs[cFtsArgs] = pwszArg; +#else + papszFtsArgs[cFtsArgs] = pszArg; +#endif + cFtsArgs++; + } + } + + /* + * Do the FTS processing. + */ + kDupSizeTree_Init(&g_SizeRoot); +#if K_OS == K_OS_WINDOWS + rcExit = kDupReadAll(papwszFtsArgs, fFtsOptions); +#else + rcExit = kDupReadAll(papszFtsArgs, fFtsOptions); +#endif + if (rcExit == 0) + { + /* + * Display the result. + */ + printf("Found %" KU64_PRI " duplicate files, out which %" KU64_PRI " can be hardlinked saving %" KU64_PRI " bytes\n", + g_cDuplicates, g_cDuplicatesSaved, g_cbDuplicatesSaved); + + if (fHardlinkDups) + rcExit = kDupHardlinkDuplicates(); + } + + K_NOREF(kDupFileTree_Remove); + K_NOREF(kDupSizeTree_Remove); + return rcExit; +} + |