/*
* Copyright 2000
* Traakan, Inc., Los Altos, CA
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Project: NDMJOB
* Ident: $Id: $
*
* Description:
* Binary Search Text File (BSTF)
*
* Use conventional binary search method on a sorted text
* file. The file MUST be sorted in ascending, lexicographic
* order. This is the default order of sort(1).
*/
#include "ndmlib.h"
#ifdef SELF_TEST
int n_seek, n_align, n_line, n_getc;
#endif
#define MIN_DELTA 2048
/*
* ndmbstf_first()
* ndmbstf_first_with_bounds()
*
* Returns:
* <0 Error
* -1 EOF
* -2 Malformed line/file
* -3 fseek() to determine file size failed
* -4 fseek() to hop failed
* =0 First line found in buf[], does not match key
* >0 Length of first matching line in buf[]
*/
int
ndmbstf_first (
FILE *fp, /* the file to search */
char *key, /* what we're looking for */
char *buf, /* returned line */
unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */
{
return ndmbstf_first_with_bounds (fp, key, buf, max_buf, 0, 0);
}
int
ndmbstf_first_with_bounds (
FILE *fp, /* the file to search */
char *key, /* what we're looking for */
char *buf, /* returned line */
unsigned max_buf, /* maximum lenght of buf (sizeof (buf)) */
off_t lower_bound, /* offset, to skip headers, usually 0 */
off_t upper_bound) /* 0->don't know, >0 limit */
{
off_t off;
off_t lower, upper; /* bounds */
off_t delta;
int rc, buf_len;
if (upper_bound == 0) {
off_t end_off;
/*
* Determine the file size using fseek()/ftell()
*/
fseeko (fp, 0, SEEK_END);
end_off = ftello (fp);
if (end_off == -1)
return -3;
upper_bound = end_off;
}
/*
* Set lower and upper bounds of the binary search
*/
lower = lower_bound;
upper = upper_bound;
for (;;) {
/*
* Determine the delta (distance) between the current
* lower and upper bounds. If delta is small, it is more
* efficient to do a linear search than to continue
* seeking. This is because stdio will pre-read
* portions no matter what and fseek()ing will discard
* the pre-read portions. MIN_DELTA is the approximation
* of the stdio pre-read size. Better to just
* linearly process the pre-read portion in the
* hopes that our answer is already sitting in the
* stdio buffer.
*/
delta = upper - lower;
if (delta <= MIN_DELTA)
break;
/*
* Seek to the first line after the midpoint
* between the lower and upper bounds.
*/
off = lower + delta / 2;
rc = ndmbstf_seek_and_align (fp, &off);
if (rc < 0) {
if (rc == EOF) {
/*
* Alignment found a very long line without
* a \n at the end of the file. All we
* can do now is try a linear search
* from the current lower bound.
*/
}
return -4; /* fseek() for hop failed */
}
/*
* Read the next line up into buf[].
*/
buf_len = ndmbstf_getline (fp, buf, max_buf);
if (buf_len <= 0) {
/*
* EOF, or malformed line. All we can do now
* is try a linear search from the current
* lower bound.
*/
break;
}
/*
* This is the crucial point.
*
* buf[] contains a line just read.
* off points the the line we just read.
* key[] is what we're looking for.
*
* Is the objective line (what we're trying to find)
* somewhere between lower..off or between off..upper.
*/
rc = ndmbstf_compare (key, buf);
if (rc > 0) {
/* key>buf. Objective somewhere in off..upper */
lower = off;
} else {
/*
* key<=buf. Objective somewhere in lower..off
* Notice that if key==buf, there still might
* be a line ==key before the one we just
* read. There might be hundreds of such
* lines. The objective is the FIRST line
* greater than or equal to the key.
* This might be it. It might not.
* So, keep searching.
*/
upper = off;
}
}
/*
* Do an unbounded linear search begining at the
* lower bound.
*/
off = lower;
rc = ndmbstf_seek_and_align (fp, &off);
if (rc < 0) {
if (rc == EOF) {
/*
* Alignment found a very long line without
* a \n at the end of the file. All we
* can do is give up.
*/
return -2;
}
return -4; /* fseek() for hop failed */
}
/*
* Read the next line up into buf[].
*/
for (;;) {
buf_len = ndmbstf_getline (fp, buf, max_buf);
if (buf_len <= 0) {
if (buf_len == EOF)
break; /* at end of file */
return -2;
}
rc = ndmbstf_compare (key, buf);
if (rc == 0) {
/* match */
return buf_len;
}
if (rc < 0)
return 0; /* have line but it doesn't match */
}
return EOF;
}
int
ndmbstf_next (
FILE *fp, /* the file to search */
char *key, /* what we're looking for */
char *buf, /* returned line */
unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */
{
int rc, buf_len;
/*
* Read the next line up into buf[].
*/
buf_len = ndmbstf_getline (fp, buf, max_buf);
if (buf_len <= 0) {
if (buf_len == EOF)
return EOF; /* at end of file */
return -2; /* malformed line */
}
rc = ndmbstf_compare (key, buf);
if (rc == 0) {
/* match */
return buf_len;
} else {
return 0; /* have line but it doesn't match */
}
}
/*
* ndmbstr_getline()
*
* Returns:
* <0 Error
* -1 EOF
* -2 Malformed line
* >=0 Length of buf
*/
int
ndmbstf_getline (FILE *fp, char *buf, unsigned max_buf)
{
char * p = buf;
char * p_end = buf + max_buf - 2;
int c;
while ((c = getc(fp)) != EOF) {
#ifdef SELF_TEST
n_getc++;
#endif
if (c == '\n')
break;
if (p < p_end)
*p++ = c;
}
*p = 0;
if (c == EOF) {
if (p > buf)
return -2; /* malformed line */
return EOF;
}
#ifdef SELF_TEST
n_line++;
#endif
return p - buf;
}
int
ndmbstf_seek_and_align (FILE *fp, off_t *off)
{
int c;
if (fseeko (fp, *off, SEEK_SET) == -1) {
return -2;
}
#ifdef SELF_TEST
n_seek++;
#endif
/*
* There is a slim chance that we're at the
* true begining of a line. Too slim.
* Scan forward discarding the trailing
* portion of the line we just fseek()ed
* to, and leave the stdio stream positioned
* for the subsequent line. Notice
* we keep off upated so that it reflects
* the seek position of the stdio stream.
*/
while ((c = getc(fp)) != EOF) {
*off += 1;
#ifdef SELF_TEST
n_align++;
#endif
if (c == '\n')
break;
}
if (c == EOF) {
/* at end of file */
return EOF;
}
return 0;
}
/*
* ndmbstf_compare()
*
* Given a key[] and a buf[], return whether or not they match.
* This effectively is strncmp (key, buf, strlen(key)).
* Because of the cost of the call to strlen(), we implement
* it ndmbstf_compare() here with the exact semantics.
*
* Return values are:
* <0 No match, key[] less than buf[]
* =0 Match, the key[] is a prefix for buf[]
* >0 No match, key[] greater than buf[]
*/
int
ndmbstf_compare (char *key, char *buf)
{
char * p = key;
char * q = buf;
while (*p != 0 && *p == *q) {
p++;
q++;
}
if (*p == 0)
return 0; /* entire key matched */
else
return *p - *q;
}
/*
* ndmbstf_match()
*
* A simple wrapper around ndmbstf_compare() with an
* easy-to-use return value..
*
* Returns:
* !0 match
* =0 no match
*/
int
ndmbstf_match (char *key, char *buf)
{
return ndmbstf_compare (key, buf) == 0;
}
#ifdef SELF_TEST
int
main (int ac, char *av[])
{
int i;
FILE * fp;
int verbose = 1;
int total_n_match = 0;
int total_n_no_match = 0;
int total_n_error = 0;
i = 1;
if (i < ac && strcmp (av[i], "-q") == 0) {
i++;
verbose = 0;
}
if (ac < i+2) {
printf ("usage: %s [-q] FILE KEY ...\n", av[0]);
return 1;
}
fp = fopen (av[i], "r");
if (!fp) {
perror (av[i]);
return 2;
}
for (i++; i < ac; i++) {
char buf[512];
char * key = av[i];
int n_match = 0;
int rc;
rc = ndmbstf_first (fp, key, buf, sizeof buf);
if (rc < 0) {
printf ("Key '%s' err=%d\n", key, rc);
total_n_error++;
continue;
}
if (rc == 0) {
printf ("Key '%s' no matches\n", key);
total_n_no_match++;
continue;
}
if (verbose)
printf ("Key '%s'\n", key);
for (; rc > 0; rc = ndmbstf_next (fp, key, buf, sizeof buf)) {
n_match++;
if (verbose)
printf (" %2d: '%s'\n", n_match, buf);
}
total_n_match += n_match;
}
fclose (fp);
printf ("n_match=%d n_miss=%d n_error=%d\n",
total_n_match, total_n_no_match, total_n_error);
printf ("n_seek=%d n_align=%d n_line=%d\n", n_seek, n_align, n_line);
return 0;
}
#endif /* SELF_TEST */