Skip to content

Commit

Permalink
Improve deduplication (#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
chpock authored Oct 3, 2024
1 parent ffaabdc commit 08519ba
Show file tree
Hide file tree
Showing 13 changed files with 1,038 additions and 13 deletions.
4 changes: 4 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
2024-10-03 Konstantin Kushnir <[email protected]>
* Improve deduplication. Detect when a file with the same contents is added
and reuse the previously allocated disk space.

2024-09-15 Konstantin Kushnir <[email protected]>
* Fix a bug with opening local files when their path starts as a VFS mount
point
Expand Down
18 changes: 12 additions & 6 deletions TODO.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
* Store the initial -smallfilesize / -smallfilebuffer / -pagesize values that were
specified when the archive was first mounted, and use them by default when reopening
the archive for writing. If these parameters are specified when reopening
the archive for writing, store the new values instead. This makes it possible to
use the same parameters each time the archive is opened for writing.

* Add an option to link with zlib directly instead of using the zlib functions
provided by Tcl. This will allow zlib to be used more efficiently, as zlib functions
are better suited to working with plain byte buffers without Tcl objects.
This will be useful when cookfs is statically linked rather than built as
a loadable package.

* Investigate an issue with -directory parameter for glob.
See tests cookfsVfs-1.8.4.1.1 and cookfsVfs-1.8.4.1.2.
Then some variable (object) is passed to -directory argument
Expand All @@ -14,12 +26,6 @@

* Add ability to disable async compression to reduce size footprint when it is not needed

* If a small file is added whose contents match the contents of any existing
small file stored on the pages, it is saved as a separate file.
Add deduplication, so that if a small file is added and its contents exist
in another file, don't duplicate its contents existing in the other file,
but use the existing data.

* Update bzip2 and use it as a submodule. Perhaps this will get rid of its compile-time warnings.

* Add ability/subcommand to copy files from/to cookfs more efficiently than using 'file copy'.
Expand Down
2 changes: 2 additions & 0 deletions generic/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#ifndef COOKFS_COMMON_H
#define COOKFS_COMMON_H 1

#define MD5_DIGEST_SIZE 16

unsigned char *Cookfs_Binary2Int(unsigned char *input, int *output, int count);
unsigned char *Cookfs_Int2Binary(int *input, unsigned char *output, int count);

Expand Down
11 changes: 11 additions & 0 deletions generic/cookfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdint.h>

// #define COOKFS_THREAD_DEBUG

Expand All @@ -42,6 +43,16 @@
#define Tcl_BounceRefCount(x) Tcl_IncrRefCount((x));Tcl_DecrRefCount((x))
#endif

// Integer <-> pointer conversion helpers. They are defined here only when
// neither INT2PTR nor PTR2INT has been defined already.
#if !defined(INT2PTR) && !defined(PTR2INT)
# if defined(HAVE_INTPTR_T) || defined(INTPTR_MAX)
// intptr_t is available: route both conversions through it
# define INT2PTR(p) ((void *)(intptr_t)(p))
# define PTR2INT(p) ((int)(intptr_t)(p))
# else
// intptr_t is not known to be available: cast directly
# define INT2PTR(p) ((void *)(p))
# define PTR2INT(p) ((int)(p))
# endif
#endif

#ifdef COOKFS_INTERNAL_DEBUG

#ifndef __FUNCTION_NAME__
Expand Down
39 changes: 37 additions & 2 deletions generic/fsindex.c
Original file line number Diff line number Diff line change
Expand Up @@ -1136,15 +1136,50 @@ static void Cookfs_FsindexEntryFree(Cookfs_FsindexEntry *e) {
e->next = e->fsindex->inactiveItems;
e->fsindex->inactiveItems = e;
} else {
CookfsLog(printf("Cookfs_FsindexEntryFree: release entry %p",
(void *)e));
// CookfsLog(printf("Cookfs_FsindexEntryFree: release entry %p",
// (void *)e));
#ifdef TCL_THREADS
Tcl_MutexFinalize(&e->mxRefCount);
#endif /* TCL_THREADS */
ckfree((void *) e);
}
}

/*
 * Cookfs_FsindexEntryForeach --
 *
 *   Invokes proc on the given entry and, when that entry is a directory,
 *   recursively on every child entry. An entry is always visited before
 *   its children.
 *
 *   e          - entry to start the traversal from
 *   proc       - callback, invoked as proc(entry, clientData)
 *   clientData - opaque value forwarded unchanged to every proc call
 */
static void Cookfs_FsindexEntryForeach(Cookfs_FsindexEntry *e, Cookfs_FsindexForeachProc *proc, ClientData clientData) {

    // Visit the entry itself before descending into it
    proc(e, clientData);

    // Only directories have children to descend into
    if (e->fileBlocks == COOKFS_NUMBLOCKS_DIRECTORY) {
        if (!e->data.dirInfo.isHash) {
            // Children are kept in a fixed-size table; unused slots are NULL
            for (int slot = 0; slot < COOKFS_FSINDEX_TABLE_MAXENTRIES; slot++) {
                Cookfs_FsindexEntry *child = e->data.dirInfo.dirData.childTable[slot];
                if (child != NULL) {
                    Cookfs_FsindexEntryForeach(child, proc, clientData);
                }
            }
        } else {
            // Children are kept in a Tcl hash table; walk all of its entries
            Tcl_HashSearch search;
            for (Tcl_HashEntry *he = Tcl_FirstHashEntry(&e->data.dirInfo.dirData.children, &search);
                    he != NULL;
                    he = Tcl_NextHashEntry(&search)) {
                Cookfs_FsindexEntry *child = (Cookfs_FsindexEntry *)Tcl_GetHashValue(he);
                Cookfs_FsindexEntryForeach(child, proc, clientData);
            }
        }
    }

}

/*
 * Cookfs_FsindexForeach --
 *
 *   Runs Cookfs_FsindexEntryForeach starting at the root item of the
 *   given fsindex, passing proc and clientData through unchanged.
 *
 *   i          - fsindex whose rootItem is the traversal starting point
 *   proc       - callback handed to Cookfs_FsindexEntryForeach
 *   clientData - opaque value handed to Cookfs_FsindexEntryForeach
 */
void Cookfs_FsindexForeach(Cookfs_Fsindex *i, Cookfs_FsindexForeachProc *proc, ClientData clientData) {
// NOTE(review): presumably checks that the caller holds at least a read
// lock on the fsindex - confirm against the Cookfs_FsindexWantRead definition
Cookfs_FsindexWantRead(i);
Cookfs_FsindexEntryForeach(i->rootItem, proc, clientData);
}

/*
*----------------------------------------------------------------------
Expand Down
4 changes: 4 additions & 0 deletions generic/fsindex.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
typedef struct _Cookfs_Fsindex Cookfs_Fsindex;
typedef struct _Cookfs_FsindexEntry Cookfs_FsindexEntry;

typedef void (Cookfs_FsindexForeachProc)(Cookfs_FsindexEntry *e, ClientData clientData);

Cookfs_Fsindex *Cookfs_FsindexGetHandle(Tcl_Interp *interp, const char *cmdName);

Cookfs_Fsindex *Cookfs_FsindexInit(Tcl_Interp *interp, Cookfs_Fsindex *i);
Expand Down Expand Up @@ -65,6 +67,8 @@ Tcl_WideInt Cookfs_FsindexEntryGetFileTime(Cookfs_FsindexEntry *e);
const char *Cookfs_FsindexEntryGetFileName(Cookfs_FsindexEntry *e,
unsigned char *fileNameLen);

void Cookfs_FsindexForeach(Cookfs_Fsindex *i, Cookfs_FsindexForeachProc *proc, ClientData clientData);

int Cookfs_FsindexEntryUnlock(Cookfs_FsindexEntry *e);
int Cookfs_FsindexEntryLock(Cookfs_FsindexEntry *e);

Expand Down
2 changes: 0 additions & 2 deletions generic/hashes.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

#include "cookfs.h"

#define MD5_DIGEST_SIZE 16

static int CookfsMd5Cmd(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]) {
UNUSED(clientData);
Tcl_Obj *obj;
Expand Down
35 changes: 35 additions & 0 deletions generic/pages.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,15 @@ int Cookfs_PagesGetLength(Cookfs_Pages *p) {
return Cookfs_PgIndexGetLength(p->pagesIndex);
}

#ifdef COOKFS_USECCRYPTO

/*
 * Cookfs_PagesIsEncryptionActive --
 *
 *   Returns the isEncryptionActive field of the given pages object.
 *   The value is also written to the debug log via CookfsLog2.
 */
int Cookfs_PagesIsEncryptionActive(Cookfs_Pages *p) {
// Trace the value that is about to be returned
CookfsLog2(printf("return: %d", p->isEncryptionActive));
return p->isEncryptionActive;
}

#endif /* COOKFS_USECCRYPTO */

/*
*----------------------------------------------------------------------
*
Expand Down Expand Up @@ -2678,6 +2687,32 @@ static Cookfs_PageObj CookfsPagesPageGetInt(Cookfs_Pages *p, int index,
return buffer;
}

/*
 * Cookfs_PagesGetPageSize --
 *
 *   Looks up the size of the page with the given index by querying the
 *   pages index with Cookfs_PgIndexGetSizeUncompressed.
 *
 *   If the index refers to an add-aside page (COOKFS_PAGES_ISASIDE), it is
 *   resolved as follows:
 *     - when p itself is the aside instance, the index is masked with
 *       COOKFS_PAGES_MASK and looked up in p;
 *     - otherwise, when p has an aside instance attached, the request is
 *       forwarded to that instance with the index unchanged;
 *     - otherwise the lookup fails.
 *
 *   p     - pages object to query
 *   index - page index, possibly carrying the add-aside flag
 *
 * Results:
 *   The value returned by Cookfs_PgIndexGetSizeUncompressed, or -1 when an
 *   add-aside index is given but no add-aside pages object is available.
 */
int Cookfs_PagesGetPageSize(Cookfs_Pages *p, int index) {

    // We don't require any locks here as page size is readonly information

    if (COOKFS_PAGES_ISASIDE(index)) {
        CookfsLog(printf("Detected get request for add-aside pages - %08x", index));
        if (p->dataPagesIsAside) {
            /* if this pages instance is the aside instance, remove the
             * COOKFS_PAGES_ASIDE flag and proceed */
            index = index & COOKFS_PAGES_MASK;
            CookfsLog(printf("New index = %08x", index));
        } else if (p->dataAsidePages != NULL) {
            /* if this is not the aside instance, redirect to it */
            CookfsLog(printf("Redirecting to add-aside pages object"));
            return Cookfs_PagesGetPageSize(p->dataAsidePages, index);
        } else {
            /* if no aside instance is specified, report failure with -1 */
            CookfsLog(printf("No add-aside pages defined"));
            return -1;
        }
    }

    return Cookfs_PgIndexGetSizeUncompressed(p->pagesIndex, index);

}

/*
*----------------------------------------------------------------------
*
Expand Down
7 changes: 6 additions & 1 deletion generic/pages.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,22 @@ int Cookfs_PagesIsCached(Cookfs_Pages *p, int index);

int Cookfs_PagesIsEncrypted(Cookfs_Pages *p, int index);


Tcl_Obj *Cookfs_PagesGetHashAsObj(Cookfs_Pages *p);
void Cookfs_PagesSetHash(Cookfs_Pages *p, Cookfs_HashType pagehash);
int Cookfs_PagesSetHashByObj(Cookfs_Pages *p, Tcl_Obj *pagehash,
Tcl_Interp *interp);

int Cookfs_PagesGetPageSize(Cookfs_Pages *p, int index);

void Cookfs_PagesCalculateHash(Cookfs_Pages *p, unsigned char *bytes,
Tcl_Size size, unsigned char *output);

int Cookfs_PageAddStamp(Cookfs_Pages *p, Tcl_WideInt size);

#ifdef COOKFS_USECCRYPTO
int Cookfs_PagesIsEncryptionActive(Cookfs_Pages *p);
#endif /* COOKFS_USECCRYPTO */

int Cookfs_PagesUnlock(Cookfs_Pages *p);
int Cookfs_PagesLockRW(int isWrite, Cookfs_Pages *p, Tcl_Obj **err);
#define Cookfs_PagesLockWrite(p,err) Cookfs_PagesLockRW(1,(p),(err))
Expand Down
Loading

0 comments on commit 08519ba

Please sign in to comment.