Blob Blame History Raw
/*
 * Copyright 2000
 *	Traakan, Inc., Los Altos, CA
 *	All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Project:  NDMJOB
 * Ident:    $Id: $
 *
 * Description:
 *	Binary Search Text File (BSTF)
 *
 *	Use conventional binary search method on a sorted text
 *	file. The file MUST be sorted in ascending, lexicographic
 *	order. This is the default order of sort(1).
 */


#include "ndmlib.h"



#ifdef SELF_TEST
int	n_seek, n_align, n_line, n_getc;
#endif


#define MIN_DELTA	2048



/*
 * ndmbstf_first()
 * ndmbstf_first_with_bounds()
 *
 * Returns:
 *	<0	Error
 *	   -1	EOF
 *	   -2	Malformed line/file
 *	   -3	fseek() to determine file size failed
 *	   -4	fseek() to hop failed
 *	=0	First line found in buf[], does not match key
 *	>0	Length of first matching line in buf[]
 */

int
ndmbstf_first (
  FILE *fp,			/* the file to search */
  char *key,			/* what we're looking for */
  char *buf,			/* returned line */
  unsigned max_buf)		/* maximum lenght of buf (sizeof (buf)) */
{
	return ndmbstf_first_with_bounds (fp, key, buf, max_buf, 0, 0);
}

int
ndmbstf_first_with_bounds (
  FILE *fp,			/* the file to search */
  char *key,			/* what we're looking for */
  char *buf,			/* returned line */
  unsigned max_buf,		/* maximum lenght of buf (sizeof (buf)) */
  off_t lower_bound,		/* offset, to skip headers, usually 0 */
  off_t upper_bound)		/* 0->don't know, >0 limit */
{
	off_t		off;
	off_t		lower, upper;		/* bounds */
	off_t		delta;
	int		rc, buf_len;

	if (upper_bound == 0) {
		off_t	end_off;

		/*
		 * Determine the file size using fseek()/ftell()
		 */

		fseeko (fp, 0, SEEK_END);
		end_off = ftello (fp);
		if (end_off == -1)
			return -3;
		upper_bound = end_off;
	}

	/*
	 * Set lower and upper bounds of the binary search
	 */
	lower = lower_bound;
	upper = upper_bound;
	for (;;) {
		/*
		 * Determine the delta (distance) between the current
		 * lower and upper bounds. If delta is small, it is more
		 * efficient to do a linear search than to continue
		 * seeking. This is because stdio will pre-read
		 * portions no matter what and fseek()ing will discard
		 * the pre-read portions. MIN_DELTA is the approximation
		 * of the stdio pre-read size. Better to just
		 * linearly process the pre-read portion in the
		 * hopes that our answer is already sitting in the
		 * stdio buffer.
		 */
		delta = upper - lower;
		if (delta <= MIN_DELTA)
			break;

		/*
		 * Seek to the first line after the midpoint
		 * between the lower and upper bounds.
		 */

		off = lower + delta / 2;
		rc = ndmbstf_seek_and_align (fp, &off);
		if (rc < 0) {
			if (rc == EOF) {
				/*
				 * Alignment found a very long line without
				 * a \n at the end of the file. All we
				 * can do now is try a linear search
				 * from the current lower bound.
				 */
			}
			return -4;	/* fseek() for hop failed */
		}

		/*
		 * Read the next line up into buf[].
		 */

		buf_len = ndmbstf_getline (fp, buf, max_buf);
		if (buf_len <= 0) {
			/*
			 * EOF, or malformed line. All we can do now
			 * is try a linear search from the current
			 * lower bound.
			 */
			break;
		}

		/*
		 * This is the crucial point.
		 *
		 *   buf[]  contains a line just read.
		 *   off    points the the line we just read.
		 *   key[]  is what we're looking for.
		 *
		 * Is the objective line (what we're trying to find)
		 * somewhere between lower..off or between off..upper.
		 */
		rc = ndmbstf_compare (key, buf);
		if (rc > 0) {
			/* key>buf. Objective somewhere in off..upper */
			lower = off;
		} else {
			/*
			 * key<=buf. Objective somewhere in lower..off
			 * Notice that if key==buf, there still might
			 * be a line ==key before the one we just
			 * read. There might be hundreds of such
			 * lines. The objective is the FIRST line
			 * greater than or equal to the key.
			 * This might be it. It might not.
			 * So, keep searching.
			 */
			upper = off;
		}
	}

	/*
	 * Do an unbounded linear search begining at the
	 * lower bound.
	 */

	off = lower;
	rc = ndmbstf_seek_and_align (fp, &off);
	if (rc < 0) {
		if (rc == EOF) {
			/*
			 * Alignment found a very long line without
			 * a \n at the end of the file. All we
			 * can do is give up.
			 */
			return -2;
		}
		return -4;	/* fseek() for hop failed */
	}

	/*
	 * Read the next line up into buf[].
	 */
	for (;;) {
		buf_len = ndmbstf_getline (fp, buf, max_buf);

		if (buf_len <= 0) {
			if (buf_len == EOF)
				break;		/* at end of file */
			return -2;
		}

		rc = ndmbstf_compare (key, buf);
		if (rc == 0) {
			/* match */
			return buf_len;
		}
		if (rc < 0)
			return 0;	/* have line but it doesn't match */
	}

	return EOF;
}

int
ndmbstf_next (
  FILE *fp,			/* the file to search */
  char *key,			/* what we're looking for */
  char *buf,			/* returned line */
  unsigned max_buf)		/* maximum lenght of buf (sizeof (buf)) */
{
	int		rc, buf_len;

	/*
	 * Read the next line up into buf[].
	 */
	buf_len = ndmbstf_getline (fp, buf, max_buf);

	if (buf_len <= 0) {
		if (buf_len == EOF)
			return EOF;		/* at end of file */
		return -2;			/* malformed line */
	}

	rc = ndmbstf_compare (key, buf);
	if (rc == 0) {
		/* match */
		return buf_len;
	} else {
		return 0;	/* have line but it doesn't match */
	}
}


/*
 * ndmbstr_getline()
 *
 * Returns:
 *	<0	Error
 *	   -1	EOF
 *	   -2	Malformed line
 *	>=0	Length of buf
 */

int
ndmbstf_getline (FILE *fp, char *buf, unsigned max_buf)
{
	char *		p = buf;
	char *		p_end = buf + max_buf - 2;
	int		c;

	while ((c = getc(fp)) != EOF) {
#ifdef SELF_TEST
		n_getc++;
#endif
		if (c == '\n')
			break;
		if (p < p_end)
			*p++ = c;
	}
	*p = 0;

	if (c == EOF) {
		if (p > buf)
			return -2;	/* malformed line */
		return EOF;
	}

#ifdef SELF_TEST
	n_line++;
#endif
	return p - buf;
}

int
ndmbstf_seek_and_align (FILE *fp, off_t *off)
{
	int		c;

	if (fseeko (fp, *off, SEEK_SET) == -1) {
		return -2;
	}
#ifdef SELF_TEST
	n_seek++;
#endif

	/*
	 * There is a slim chance that we're at the
	 * true begining of a line. Too slim.
	 * Scan forward discarding the trailing
	 * portion of the line we just fseek()ed
	 * to, and leave the stdio stream positioned
	 * for the subsequent line. Notice
	 * we keep off upated so that it reflects
	 * the seek position of the stdio stream.
	 */

	while ((c = getc(fp)) != EOF) {
		*off += 1;
#ifdef SELF_TEST
		n_align++;
#endif
		if (c == '\n')
			break;
	}
	if (c == EOF) {
		/* at end of file */
		return EOF;
	}

	return 0;
}



/*
 * ndmbstf_compare()
 *
 * Given a key[] and a buf[], return whether or not they match.
 * This effectively is strncmp (key, buf, strlen(key)).
 * Because of the cost of the call to strlen(), we implement
 * it ndmbstf_compare() here with the exact semantics.
 *
 * Return values are:
 *	<0	No match, key[] less than buf[]
 *	=0	Match, the key[] is a prefix for buf[]
 *	>0	No match, key[] greater than buf[]
 */

int
ndmbstf_compare (char *key, char *buf)
{
	char *		p = key;
	char *		q = buf;

	while (*p != 0 && *p == *q) {
		p++;
		q++;
	}

	if (*p == 0)
		return 0;	/* entire key matched */
	else
		return *p - *q;
}


/*
 * ndmbstf_match()
 *
 * A simple wrapper around ndmbstf_compare() with an
 * easy-to-use return value..
 *
 * Returns:
 *	!0	match
 *	=0	no match
 */

int
ndmbstf_match (char *key, char *buf)
{
	return ndmbstf_compare (key, buf) == 0;
}




#ifdef SELF_TEST
int
main (int ac, char *av[])
{
	int		i;
	FILE *		fp;
	int		verbose = 1;
	int		total_n_match = 0;
	int		total_n_no_match = 0;
	int		total_n_error = 0;

	i = 1;
	if (i < ac && strcmp (av[i], "-q") == 0) {
		i++;
		verbose = 0;
	}

	if (ac < i+2) {
		printf ("usage: %s [-q] FILE KEY ...\n", av[0]);
		return 1;
	}

	fp = fopen (av[i], "r");
	if (!fp) {
		perror (av[i]);
		return 2;
	}

	for (i++; i < ac; i++) {
		char		buf[512];
		char *		key = av[i];
		int		n_match = 0;
		int		rc;

		rc = ndmbstf_first (fp, key, buf, sizeof buf);
		if (rc < 0) {
			printf ("Key '%s' err=%d\n", key, rc);
			total_n_error++;
			continue;
		}

		if (rc == 0) {
			printf ("Key '%s' no matches\n", key);
			total_n_no_match++;
			continue;
		}

		if (verbose)
			printf ("Key '%s'\n", key);
		for (; rc > 0; rc = ndmbstf_next (fp, key, buf, sizeof buf)) {
			n_match++;
			if (verbose)
				printf ("    %2d: '%s'\n", n_match, buf);
		}
		total_n_match += n_match;
	}

	fclose (fp);

	printf ("n_match=%d n_miss=%d n_error=%d\n",
		total_n_match, total_n_no_match, total_n_error);
	printf ("n_seek=%d n_align=%d n_line=%d\n", n_seek, n_align, n_line);

	return 0;
}
#endif /* SELF_TEST */