/*
 * Copyright (C) the libgit2 contributors. All rights reserved.
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */
#include "git2/sys/hashsig.h"
#include "fileops.h"
#include "util.h"

/* One run hash as stored in a signature: the low 32 bits of the state. */
typedef uint32_t hashsig_t;
/* Wider accumulator used while the bytes of a run are being mixed in. */
typedef uint64_t hashsig_state;

/* Similarity scores are reported on a 0..HASHSIG_SCALE scale. */
#define HASHSIG_SCALE 100

/* Maximum number of bytes mixed into one hash before the run is cut. */
#define HASHSIG_MAX_RUN 80
/* Value every run hash starts from. */
#define HASHSIG_HASH_START	0x012345678ABCDEF0LL
#define HASHSIG_HASH_SHIFT  5

/*
 * Mix one byte into the running hash: S = S * 31 + CH, computed as
 * (S << 5) - S + CH.  S is evaluated more than once, so it must be a
 * plain lvalue without side effects.
 */
#define HASHSIG_HASH_MIX(S,CH) \
	((S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH))

/* Capacity of each hash heap (127 entries). */
#define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
/* Fewer hashes than this makes a file "too small" unless allowed. */
#define HASHSIG_HEAP_MIN_SIZE 4

/* Three-way comparison callback; the third argument is an opaque payload. */
typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
typedef struct {
Packit Service 20376f
	int size, asize;
Packit Service 20376f
	hashsig_cmp cmp;
Packit Service 20376f
	hashsig_t values[HASHSIG_HEAP_SIZE];
Packit Service 20376f
} hashsig_heap;
struct git_hashsig {
Packit Service 20376f
	hashsig_heap mins;
Packit Service 20376f
	hashsig_heap maxs;
Packit Service 20376f
	size_t lines;
Packit Service 20376f
	git_hashsig_option_t opt;
Packit Service 20376f
};
/*
 * Index arithmetic for a binary heap laid out in an array with the
 * root at index 0: the children of I are 2*I+1 and 2*I+2, and the
 * parent of I (for I > 0) is (I-1)/2.
 */
#define HEAP_LCHILD_OF(I) (((I)<<1)+1)
#define HEAP_RCHILD_OF(I) (((I)<<1)+2)
#define HEAP_PARENT_OF(I) (((I)-1)>>1)
/*
 * Reset `h` to an empty heap of capacity HASHSIG_HEAP_SIZE ordered by
 * `cmp`.  The contents of h->values are left untouched.
 */
static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
{
	h->size  = 0;
	h->asize = HASHSIG_HEAP_SIZE;
	h->cmp   = cmp;
}
static int hashsig_cmp_max(const void *a, const void *b, void *payload)
Packit Service 20376f
{
Packit Service 20376f
	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
Packit Service 20376f
	GIT_UNUSED(payload);
Packit Service 20376f
	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
Packit Service 20376f
}
static int hashsig_cmp_min(const void *a, const void *b, void *payload)
Packit Service 20376f
{
Packit Service 20376f
	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
Packit Service 20376f
	GIT_UNUSED(payload);
Packit Service 20376f
	return (av > bv) ? -1 : (av < bv) ? 1 : 0;
Packit Service 20376f
}
/*
 * Sift the element at index `el` towards the root: swap it with its
 * parent for as long as the parent compares greater than it under
 * h->cmp, stopping at the root.
 */
static void hashsig_heap_up(hashsig_heap *h, int el)
{
	int parent_el = HEAP_PARENT_OF(el);

	/* `el > 0` is tested first, so values[parent_el] is never read for the root */
	while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
		hashsig_t t = h->values[el];
		h->values[el] = h->values[parent_el];
		h->values[parent_el] = t;

		el = parent_el;
		parent_el = HEAP_PARENT_OF(el);
	}
}
/*
 * Sift the element at index `el` towards the leaves: while it does
 * not compare strictly less than both children under h->cmp, swap it
 * with the child that compares smaller (the right child on a tie).
 *
 * NOTE(review): for an even h->size the last parent's right-child
 * index equals h->size, so values[h->size] is read even though it is
 * outside the logical heap.  The only caller visible here,
 * hashsig_heap_insert(), calls this right after moving
 * values[h->size] to the root, so that slot is inside the array and
 * still holds a copy of the value being sifted.  Confirm before
 * reusing this helper elsewhere.
 */
static void hashsig_heap_down(hashsig_heap *h, int el)
{
	hashsig_t v, lv, rv;

	/* 'el < h->size / 2' tests if el is bottom row of heap */

	while (el < h->size / 2) {
		int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;

		v  = h->values[el];
		lv = h->values[lel];
		rv = h->values[rel];

		/* heap property already holds at this node */
		if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
			break;

		swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;

		h->values[el] = h->values[swapel];
		h->values[swapel] = v;

		el = swapel;
	}
}
/*
 * Sort the first h->size entries of the heap's array in place with
 * h->cmp, so that two signatures can be compared by a linear merge.
 */
static void hashsig_heap_sort(hashsig_heap *h)
{
	/* only need to do this at the end for signature comparison */
	git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
}
/*
 * Offer `val` to the bounded heap.
 *
 * While the heap has room, the value is appended and sifted up.  Once
 * the heap is full, a value that compares greater than the root under
 * h->cmp evicts the root: the last element is moved to index 0 and
 * sifted down, which leaves one free slot.
 *
 * NOTE(review): on the eviction path `val` itself is not stored; the
 * freed slot is filled by whatever value is offered next.  That looks
 * unintended, but changing it would alter the signatures produced, so
 * the behavior is kept -- confirm before "fixing".
 */
static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
{
	/* if heap is not full, insert new element */
	if (h->size < h->asize) {
		h->values[h->size++] = val;
		hashsig_heap_up(h, h->size - 1);
	}

	/* if heap is full, pop top if new element should replace it */
	else if (h->cmp(&val, &h->values[0], NULL) > 0) {
		h->size--;
		h->values[0] = h->values[h->size];
		hashsig_heap_down(h, 0);
	}
}
/* Scanner state carried from one hashsig_add_hashes() call to the next. */
typedef struct {
	int use_ignores;        /* non-zero while whitespace is being skipped */
	uint8_t ignore_ch[256]; /* per-byte flag table, indexed by byte value */
} hashsig_in_progress;
static void hashsig_in_progress_init(
Packit Service 20376f
	hashsig_in_progress *prog, git_hashsig *sig)
Packit Service 20376f
{
Packit Service 20376f
	int i;
Packit Service 20376f
Packit Service 20376f
	/* no more than one can be set */
Packit Service 20376f
	assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
Packit Service 20376f
		   !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
Packit Service 20376f
Packit Service 20376f
	if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
Packit Service 20376f
		for (i = 0; i < 256; ++i)
Packit Service 20376f
			prog->ignore_ch[i] = git__isspace_nonlf(i);
Packit Service 20376f
		prog->use_ignores = 1;
Packit Service 20376f
	} else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
Packit Service 20376f
		for (i = 0; i < 256; ++i)
Packit Service 20376f
			prog->ignore_ch[i] = git__isspace(i);
Packit Service 20376f
		prog->use_ignores = 1;
Packit Service 20376f
	} else {
Packit Service 20376f
		memset(prog, 0, sizeof(*prog));
Packit Service 20376f
	}
Packit Service 20376f
}
static int hashsig_add_hashes(
Packit Service 20376f
	git_hashsig *sig,
Packit Service 20376f
	const uint8_t *data,
Packit Service 20376f
	size_t size,
Packit Service 20376f
	hashsig_in_progress *prog)
Packit Service 20376f
{
Packit Service 20376f
	const uint8_t *scan = data, *end = data + size;
Packit Service 20376f
	hashsig_state state = HASHSIG_HASH_START;
Packit Service 20376f
	int use_ignores = prog->use_ignores, len;
Packit Service 20376f
	uint8_t ch;
Packit Service 20376f
Packit Service 20376f
	while (scan < end) {
Packit Service 20376f
		state = HASHSIG_HASH_START;
Packit Service 20376f
Packit Service 20376f
		for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
Packit Service 20376f
			ch = *scan;
Packit Service 20376f
Packit Service 20376f
			if (use_ignores)
Packit Service 20376f
				for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
Packit Service 20376f
					++scan;
Packit Service 20376f
			else if (sig->opt &
Packit Service 20376f
					 (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
Packit Service 20376f
				for (; scan < end && ch == '\r'; ch = *scan)
Packit Service 20376f
					++scan;
Packit Service 20376f
Packit Service 20376f
			/* peek at next character to decide what to do next */
Packit Service 20376f
			if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
Packit Service 20376f
				use_ignores = (ch == '\n');
Packit Service 20376f
Packit Service 20376f
			if (scan >= end)
Packit Service 20376f
				break;
Packit Service 20376f
			++scan;
Packit Service 20376f
Packit Service 20376f
			/* check run terminator */
Packit Service 20376f
			if (ch == '\n' || ch == '\0') {
Packit Service 20376f
				sig->lines++;
Packit Service 20376f
				break;
Packit Service 20376f
			}
Packit Service 20376f
Packit Service 20376f
			++len;
Packit Service 20376f
			HASHSIG_HASH_MIX(state, ch);
Packit Service 20376f
		}
Packit Service 20376f
Packit Service 20376f
		if (len > 0) {
Packit Service 20376f
			hashsig_heap_insert(&sig->mins, (hashsig_t)state);
Packit Service 20376f
			hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
Packit Service 20376f
Packit Service 20376f
			while (scan < end && (*scan == '\n' || !*scan))
Packit Service 20376f
				++scan;
Packit Service 20376f
		}
Packit Service 20376f
	}
Packit Service 20376f
Packit Service 20376f
	prog->use_ignores = use_ignores;
Packit Service 20376f
Packit Service 20376f
	return 0;
Packit Service 20376f
}
/*
 * Finish a signature once all input has been fed in.
 *
 * If fewer than HASHSIG_HEAP_MIN_SIZE hashes were collected and
 * GIT_HASHSIG_ALLOW_SMALL_FILES is not set, an error message is set
 * and GIT_EBUFS is returned.  Otherwise both heaps are sorted so they
 * can be compared, and 0 is returned.
 */
static int hashsig_finalize_hashes(git_hashsig *sig)
{
	if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
		!(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
		giterr_set(GITERR_INVALID,
			"file too small for similarity signature calculation");
		return GIT_EBUFS;
	}

	hashsig_heap_sort(&sig->mins);
	hashsig_heap_sort(&sig->maxs);

	return 0;
}
/*
 * Allocate a zeroed signature, initialize its two heaps (mins with
 * hashsig_cmp_min, maxs with hashsig_cmp_max) and record `opts`.
 *
 * Returns NULL if the allocation fails.
 */
static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
{
	git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
	if (!sig)
		return NULL;

	hashsig_heap_init(&sig->mins, hashsig_cmp_min);
	hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
	sig->opt = opts;

	return sig;
}
int git_hashsig_create(
Packit Service 20376f
	git_hashsig **out,
Packit Service 20376f
	const char *buf,
Packit Service 20376f
	size_t buflen,
Packit Service 20376f
	git_hashsig_option_t opts)
Packit Service 20376f
{
Packit Service 20376f
	int error;
Packit Service 20376f
	hashsig_in_progress prog;
Packit Service 20376f
	git_hashsig *sig = hashsig_alloc(opts);
Packit Service 20376f
	GITERR_CHECK_ALLOC(sig);
Packit Service 20376f
Packit Service 20376f
	hashsig_in_progress_init(&prog, sig);
Packit Service 20376f
Packit Service 20376f
	error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog;;
Packit Service 20376f
Packit Service 20376f
	if (!error)
Packit Service 20376f
		error = hashsig_finalize_hashes(sig);
Packit Service 20376f
Packit Service 20376f
	if (!error)
Packit Service 20376f
		*out = sig;
Packit Service 20376f
	else
Packit Service 20376f
		git_hashsig_free(sig);
Packit Service 20376f
Packit Service 20376f
	return error;
Packit Service 20376f
}
int git_hashsig_create_fromfile(
Packit Service 20376f
	git_hashsig **out,
Packit Service 20376f
	const char *path,
Packit Service 20376f
	git_hashsig_option_t opts)
Packit Service 20376f
{
Packit Service 20376f
	uint8_t buf[0x1000];
Packit Service 20376f
	ssize_t buflen = 0;
Packit Service 20376f
	int error = 0, fd;
Packit Service 20376f
	hashsig_in_progress prog;
Packit Service 20376f
	git_hashsig *sig = hashsig_alloc(opts);
Packit Service 20376f
	GITERR_CHECK_ALLOC(sig);
Packit Service 20376f
Packit Service 20376f
	if ((fd = git_futils_open_ro(path)) < 0) {
Packit Service 20376f
		git__free(sig);
Packit Service 20376f
		return fd;
Packit Service 20376f
	}
Packit Service 20376f
Packit Service 20376f
	hashsig_in_progress_init(&prog, sig);
Packit Service 20376f
Packit Service 20376f
	while (!error) {
Packit Service 20376f
		if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
Packit Service 20376f
			if ((error = (int)buflen) < 0)
Packit Service 20376f
				giterr_set(GITERR_OS,
Packit Service 20376f
					"read error on '%s' calculating similarity hashes", path);
Packit Service 20376f
			break;
Packit Service 20376f
		}
Packit Service 20376f
Packit Service 20376f
		error = hashsig_add_hashes(sig, buf, buflen, &prog;;
Packit Service 20376f
	}
Packit Service 20376f
Packit Service 20376f
	p_close(fd);
Packit Service 20376f
Packit Service 20376f
	if (!error)
Packit Service 20376f
		error = hashsig_finalize_hashes(sig);
Packit Service 20376f
Packit Service 20376f
	if (!error)
Packit Service 20376f
		*out = sig;
Packit Service 20376f
	else
Packit Service 20376f
		git_hashsig_free(sig);
Packit Service 20376f
Packit Service 20376f
	return error;
Packit Service 20376f
}
/*
 * Release a signature.  The heaps are stored inline in the structure,
 * so freeing the single allocation is all that is needed.
 */
void git_hashsig_free(git_hashsig *sig)
{
	git__free(sig);
}
static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
Packit Service 20376f
{
Packit Service 20376f
	int matches = 0, i, j, cmp;
Packit Service 20376f
Packit Service 20376f
	assert(a->cmp == b->cmp);
Packit Service 20376f
Packit Service 20376f
	/* hash heaps are sorted - just look for overlap vs total */
Packit Service 20376f
Packit Service 20376f
	for (i = 0, j = 0; i < a->size && j < b->size; ) {
Packit Service 20376f
		cmp = a->cmp(&a->values[i], &b->values[j], NULL);
Packit Service 20376f
Packit Service 20376f
		if (cmp < 0)
Packit Service 20376f
			++i;
Packit Service 20376f
		else if (cmp > 0)
Packit Service 20376f
			++j;
Packit Service 20376f
		else {
Packit Service 20376f
			++i; ++j; ++matches;
Packit Service 20376f
		}
Packit Service 20376f
	}
Packit Service 20376f
Packit Service 20376f
	return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
Packit Service 20376f
}
int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
Packit Service 20376f
{
Packit Service 20376f
	/* if we have no elements in either file then each file is either
Packit Service 20376f
	 * empty or blank.  if we're ignoring whitespace then the files are
Packit Service 20376f
	 * similar, otherwise they're dissimilar.
Packit Service 20376f
	 */
Packit Service 20376f
	if (a->mins.size == 0 && b->mins.size == 0) {
Packit Service 20376f
		if ((!a->lines && !b->lines) ||
Packit Service 20376f
			(a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
Packit Service 20376f
			return HASHSIG_SCALE;
Packit Service 20376f
		else
Packit Service 20376f
			return 0;
Packit Service 20376f
	}
Packit Service 20376f
Packit Service 20376f
	/* if we have fewer than the maximum number of elements, then just use
Packit Service 20376f
	 * one array since the two arrays will be the same
Packit Service 20376f
	 */
Packit Service 20376f
	if (a->mins.size < HASHSIG_HEAP_SIZE)
Packit Service 20376f
		return hashsig_heap_compare(&a->mins, &b->mins);
Packit Service 20376f
	else
Packit Service 20376f
		return (hashsig_heap_compare(&a->mins, &b->mins) +
Packit Service 20376f
				hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
Packit Service 20376f
}