Blame src/io.c

Packit 33f14e
/* File I/O for GNU DIFF.
Packit 33f14e
Packit 33f14e
   Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013,
Packit 33f14e
   2015-2017 Free Software Foundation, Inc.
Packit 33f14e
Packit 33f14e
   This file is part of GNU DIFF.
Packit 33f14e
Packit 33f14e
   This program is free software: you can redistribute it and/or modify
Packit 33f14e
   it under the terms of the GNU General Public License as published by
Packit 33f14e
   the Free Software Foundation, either version 3 of the License, or
Packit 33f14e
   (at your option) any later version.
Packit 33f14e
Packit 33f14e
   This program is distributed in the hope that it will be useful,
Packit 33f14e
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 33f14e
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 33f14e
   GNU General Public License for more details.
Packit 33f14e
Packit 33f14e
   You should have received a copy of the GNU General Public License
Packit 33f14e
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit 33f14e
Packit 33f14e
#include "diff.h"
Packit 33f14e
#include <binary-io.h>
Packit 33f14e
#include <cmpbuf.h>
Packit 33f14e
#include <file-type.h>
Packit 33f14e
#include <xalloc.h>
Packit 33f14e
Packit 33f14e
/* Rotate an unsigned value to the left.  */
Packit 33f14e
#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
Packit 33f14e
Packit 33f14e
/* Given a hash value and a new character, return a new hash value.  */
Packit 33f14e
#define HASH(h, c) ((c) + ROL (h, 7))
Packit 33f14e

Packit 33f14e
/* The type of a hash value.  */
Packit 33f14e
typedef size_t hash_value;
Packit 33f14e
verify (! TYPE_SIGNED (hash_value));
Packit 33f14e
Packit 33f14e
/* Lines are put into equivalence classes of lines that match in lines_differ.
Packit 33f14e
   Each equivalence class is represented by one of these structures,
Packit 33f14e
   but only while the classes are being computed.
Packit 33f14e
   Afterward, each class is represented by a number.  */
Packit 33f14e
struct equivclass
Packit 33f14e
{
Packit 33f14e
  lin next;		/* Next item in this bucket.  */
Packit 33f14e
  hash_value hash;	/* Hash of lines in this class.  */
Packit 33f14e
  char const *line;	/* A line that fits this class.  */
Packit 33f14e
  size_t length;	/* That line's length, not counting its newline.  */
Packit 33f14e
};
Packit 33f14e
Packit 33f14e
/* Hash-table: array of buckets, each being a chain of equivalence classes.
Packit 33f14e
   buckets[-1] is reserved for incomplete lines.  */
Packit 33f14e
static lin *buckets;
Packit 33f14e
Packit 33f14e
/* Number of buckets in the hash table array, not counting buckets[-1].  */
Packit 33f14e
static size_t nbuckets;
Packit 33f14e
Packit 33f14e
/* Array in which the equivalence classes are allocated.
Packit 33f14e
   The bucket-chains go through the elements in this array.
Packit 33f14e
   The number of an equivalence class is its index in this array.  */
Packit 33f14e
static struct equivclass *equivs;
Packit 33f14e
Packit 33f14e
/* Index of first free element in the array 'equivs'.  */
Packit 33f14e
static lin equivs_index;
Packit 33f14e
Packit 33f14e
/* Number of elements allocated in the array 'equivs'.  */
Packit 33f14e
static lin equivs_alloc;
Packit 33f14e

Packit 33f14e
/* Read a block of data into a file buffer, checking for EOF and error.  */
Packit 33f14e
Packit 33f14e
void
Packit 33f14e
file_block_read (struct file_data *current, size_t size)
Packit 33f14e
{
Packit 33f14e
  if (size && ! current->eof)
Packit 33f14e
    {
Packit 33f14e
      size_t s = block_read (current->desc,
Packit 33f14e
			     FILE_BUFFER (current) + current->buffered, size);
Packit 33f14e
      if (s == SIZE_MAX)
Packit 33f14e
	pfatal_with_name (current->name);
Packit 33f14e
      current->buffered += s;
Packit 33f14e
      current->eof = s < size;
Packit 33f14e
    }
Packit 33f14e
}
Packit 33f14e

Packit 33f14e
/* Check for binary files and compare them for exact identity.  */
Packit 33f14e
Packit 33f14e
/* Return 1 if BUF contains a non text character.
Packit 33f14e
   SIZE is the number of characters in BUF.  */
Packit 33f14e
Packit 33f14e
#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
Packit 33f14e
Packit 33f14e
/* Get ready to read the current file.
Packit 33f14e
   Return nonzero if SKIP_TEST is zero,
Packit 33f14e
   and if it appears to be a binary file.  */
Packit 33f14e
Packit 33f14e
static bool
Packit 33f14e
sip (struct file_data *current, bool skip_test)
Packit 33f14e
{
Packit 33f14e
  /* If we have a nonexistent file at this stage, treat it as empty.  */
Packit 33f14e
  if (current->desc < 0)
Packit 33f14e
    {
Packit 33f14e
      /* Leave room for a sentinel.  */
Packit 33f14e
      current->bufsize = sizeof (word);
Packit 33f14e
      current->buffer = xmalloc (current->bufsize);
Packit 33f14e
    }
Packit 33f14e
  else
Packit 33f14e
    {
Packit 33f14e
      current->bufsize = buffer_lcm (sizeof (word),
Packit 33f14e
				     STAT_BLOCKSIZE (current->stat),
Packit 33f14e
				     PTRDIFF_MAX - 2 * sizeof (word));
Packit 33f14e
      current->buffer = xmalloc (current->bufsize);
Packit 33f14e
Packit 33f14e
#ifdef __KLIBC__
Packit 33f14e
      /* Skip test if seek is not possible */
Packit 33f14e
      skip_test = skip_test
Packit 33f14e
		  || (lseek (current->desc, 0, SEEK_CUR) < 0
Packit 33f14e
		      && errno == ESPIPE);
Packit 33f14e
#endif
Packit 33f14e
Packit 33f14e
      if (! skip_test)
Packit 33f14e
	{
Packit 33f14e
	  /* Check first part of file to see if it's a binary file.  */
Packit 33f14e
Packit 33f14e
	  int prev_mode = set_binary_mode (current->desc, O_BINARY);
Packit 33f14e
	  off_t buffered;
Packit 33f14e
	  file_block_read (current, current->bufsize);
Packit 33f14e
	  buffered = current->buffered;
Packit 33f14e
Packit 33f14e
	  if (prev_mode != O_BINARY)
Packit 33f14e
	    {
Packit 33f14e
	      /* Revert to text mode and seek back to the start to reread
Packit 33f14e
		 the file.  Use relative seek, since file descriptors
Packit 33f14e
		 like stdin might not start at offset zero.  */
Packit 33f14e
	      if (lseek (current->desc, - buffered, SEEK_CUR) < 0)
Packit 33f14e
		pfatal_with_name (current->name);
Packit 33f14e
	      set_binary_mode (current->desc, prev_mode);
Packit 33f14e
	      current->buffered = 0;
Packit 33f14e
	      current->eof = false;
Packit 33f14e
	    }
Packit 33f14e
Packit 33f14e
	  return binary_file_p (current->buffer, buffered);
Packit 33f14e
	}
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  current->buffered = 0;
Packit 33f14e
  current->eof = false;
Packit 33f14e
  return false;
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
/* Slurp the rest of the current file completely into memory.  */
Packit 33f14e
Packit 33f14e
static void
Packit 33f14e
slurp (struct file_data *current)
Packit 33f14e
{
Packit 33f14e
  size_t cc;
Packit 33f14e
Packit 33f14e
  if (current->desc < 0)
Packit 33f14e
    {
Packit 33f14e
      /* The file is nonexistent.  */
Packit 33f14e
      return;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  if (S_ISREG (current->stat.st_mode))
Packit 33f14e
    {
Packit 33f14e
      /* It's a regular file; slurp in the rest all at once.  */
Packit 33f14e
Packit 33f14e
      /* Get the size out of the stat block.
Packit 33f14e
	 Allocate just enough room for appended newline plus word sentinel,
Packit 33f14e
	 plus word-alignment since we want the buffer word-aligned.  */
Packit 33f14e
      size_t file_size = current->stat.st_size;
Packit 33f14e
      cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
Packit 33f14e
      if (file_size != current->stat.st_size || cc < file_size
Packit 33f14e
	  || PTRDIFF_MAX <= cc)
Packit 33f14e
	xalloc_die ();
Packit 33f14e
Packit 33f14e
      if (current->bufsize < cc)
Packit 33f14e
	{
Packit 33f14e
	  current->bufsize = cc;
Packit 33f14e
	  current->buffer = xrealloc (current->buffer, cc);
Packit 33f14e
	}
Packit 33f14e
Packit 33f14e
      /* Try to read at least 1 more byte than the size indicates, to
Packit 33f14e
	 detect whether the file is growing.  This is a nicety for
Packit 33f14e
	 users who run 'diff' on files while they are changing.  */
Packit 33f14e
Packit 33f14e
      if (current->buffered <= file_size)
Packit 33f14e
	{
Packit 33f14e
	  file_block_read (current, file_size + 1 - current->buffered);
Packit 33f14e
	  if (current->buffered <= file_size)
Packit 33f14e
	    return;
Packit 33f14e
	}
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  /* It's not a regular file, or it's a growing regular file; read it,
Packit 33f14e
     growing the buffer as needed.  */
Packit 33f14e
Packit 33f14e
  file_block_read (current, current->bufsize - current->buffered);
Packit 33f14e
Packit 33f14e
  if (current->buffered)
Packit 33f14e
    {
Packit 33f14e
      while (current->buffered == current->bufsize)
Packit 33f14e
	{
Packit 33f14e
	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
Packit 33f14e
	    xalloc_die ();
Packit 33f14e
	  current->bufsize *= 2;
Packit 33f14e
	  current->buffer = xrealloc (current->buffer, current->bufsize);
Packit 33f14e
	  file_block_read (current, current->bufsize - current->buffered);
Packit 33f14e
	}
Packit 33f14e
Packit 33f14e
      /* Allocate just enough room for appended newline plus word
Packit 33f14e
	 sentinel, plus word-alignment.  */
Packit 33f14e
      cc = current->buffered + 2 * sizeof (word);
Packit 33f14e
      current->bufsize = cc - cc % sizeof (word);
Packit 33f14e
      current->buffer = xrealloc (current->buffer, current->bufsize);
Packit 33f14e
    }
Packit 33f14e
}
Packit 33f14e

Packit 33f14e
/* Split the file into lines, simultaneously computing the equivalence
Packit 33f14e
   class for each line.  */
Packit 33f14e
Packit 33f14e
static void
Packit 33f14e
find_and_hash_each_line (struct file_data *current)
Packit 33f14e
{
Packit 33f14e
  char const *p = current->prefix_end;
Packit 33f14e
  lin i, *bucket;
Packit 33f14e
  size_t length;
Packit 33f14e
Packit 33f14e
  /* Cache often-used quantities in local variables to help the compiler.  */
Packit 33f14e
  char const **linbuf = current->linbuf;
Packit 33f14e
  lin alloc_lines = current->alloc_lines;
Packit 33f14e
  lin line = 0;
Packit 33f14e
  lin linbuf_base = current->linbuf_base;
Packit 33f14e
  lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
Packit 33f14e
  struct equivclass *eqs = equivs;
Packit 33f14e
  lin eqs_index = equivs_index;
Packit 33f14e
  lin eqs_alloc = equivs_alloc;
Packit 33f14e
  char const *suffix_begin = current->suffix_begin;
Packit 33f14e
  char const *bufend = FILE_BUFFER (current) + current->buffered;
Packit 33f14e
  bool ig_case = ignore_case;
Packit 33f14e
  enum DIFF_white_space ig_white_space = ignore_white_space;
Packit 33f14e
  bool diff_length_compare_anyway =
Packit 33f14e
    ig_white_space != IGNORE_NO_WHITE_SPACE;
Packit 33f14e
  bool same_length_diff_contents_compare_anyway =
Packit 33f14e
    diff_length_compare_anyway | ig_case;
Packit 33f14e
Packit 33f14e
  while (p < suffix_begin)
Packit 33f14e
    {
Packit 33f14e
      char const *ip = p;
Packit 33f14e
      hash_value h = 0;
Packit 33f14e
      unsigned char c;
Packit 33f14e
Packit 33f14e
      /* Hash this line until we find a newline.  */
Packit 33f14e
      switch (ig_white_space)
Packit 33f14e
	{
Packit 33f14e
	case IGNORE_ALL_SPACE:
Packit 33f14e
	  while ((c = *p++) != '\n')
Packit 33f14e
	    if (! isspace (c))
Packit 33f14e
	      h = HASH (h, ig_case ? tolower (c) : c);
Packit 33f14e
	  break;
Packit 33f14e
Packit 33f14e
	case IGNORE_SPACE_CHANGE:
Packit 33f14e
	  while ((c = *p++) != '\n')
Packit 33f14e
	    {
Packit 33f14e
	      if (isspace (c))
Packit 33f14e
		{
Packit 33f14e
		  do
Packit 33f14e
		    if ((c = *p++) == '\n')
Packit 33f14e
		      goto hashing_done;
Packit 33f14e
		  while (isspace (c));
Packit 33f14e
Packit 33f14e
		  h = HASH (h, ' ');
Packit 33f14e
		}
Packit 33f14e
Packit 33f14e
	      /* C is now the first non-space.  */
Packit 33f14e
	      h = HASH (h, ig_case ? tolower (c) : c);
Packit 33f14e
	    }
Packit 33f14e
	  break;
Packit 33f14e
Packit 33f14e
	case IGNORE_TAB_EXPANSION:
Packit 33f14e
	case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
Packit 33f14e
	case IGNORE_TRAILING_SPACE:
Packit 33f14e
	  {
Packit 33f14e
	    size_t column = 0;
Packit 33f14e
	    while ((c = *p++) != '\n')
Packit 33f14e
	      {
Packit 33f14e
		if (ig_white_space & IGNORE_TRAILING_SPACE
Packit 33f14e
		    && isspace (c))
Packit 33f14e
		  {
Packit 33f14e
		    char const *p1 = p;
Packit 33f14e
		    unsigned char c1;
Packit 33f14e
		    do
Packit 33f14e
		      if ((c1 = *p1++) == '\n')
Packit 33f14e
			{
Packit 33f14e
			  p = p1;
Packit 33f14e
			  goto hashing_done;
Packit 33f14e
			}
Packit 33f14e
		    while (isspace (c1));
Packit 33f14e
		  }
Packit 33f14e
Packit 33f14e
		size_t repetitions = 1;
Packit 33f14e
Packit 33f14e
		if (ig_white_space & IGNORE_TAB_EXPANSION)
Packit 33f14e
		  switch (c)
Packit 33f14e
		    {
Packit 33f14e
		    case '\b':
Packit 33f14e
		      column -= 0 < column;
Packit 33f14e
		      break;
Packit 33f14e
Packit 33f14e
		    case '\t':
Packit 33f14e
		      c = ' ';
Packit 33f14e
		      repetitions = tabsize - column % tabsize;
Packit 33f14e
		      column = (column + repetitions < column
Packit 33f14e
				? 0
Packit 33f14e
				: column + repetitions);
Packit 33f14e
		      break;
Packit 33f14e
Packit 33f14e
		    case '\r':
Packit 33f14e
		      column = 0;
Packit 33f14e
		      break;
Packit 33f14e
Packit 33f14e
		    default:
Packit 33f14e
		      column++;
Packit 33f14e
		      break;
Packit 33f14e
		    }
Packit 33f14e
Packit 33f14e
		if (ig_case)
Packit 33f14e
		  c = tolower (c);
Packit 33f14e
Packit 33f14e
		do
Packit 33f14e
		  h = HASH (h, c);
Packit 33f14e
		while (--repetitions != 0);
Packit 33f14e
	      }
Packit 33f14e
	  }
Packit 33f14e
	  break;
Packit 33f14e
Packit 33f14e
	default:
Packit 33f14e
	  if (ig_case)
Packit 33f14e
	    while ((c = *p++) != '\n')
Packit 33f14e
	      h = HASH (h, tolower (c));
Packit 33f14e
	  else
Packit 33f14e
	    while ((c = *p++) != '\n')
Packit 33f14e
	      h = HASH (h, c);
Packit 33f14e
	  break;
Packit 33f14e
	}
Packit 33f14e
Packit 33f14e
   hashing_done:;
Packit 33f14e
Packit 33f14e
      bucket = &buckets[h % nbuckets];
Packit 33f14e
      length = p - ip - 1;
Packit 33f14e
Packit 33f14e
      if (p == bufend
Packit 33f14e
	  && current->missing_newline
Packit 33f14e
	  && ROBUST_OUTPUT_STYLE (output_style))
Packit 33f14e
	{
Packit 33f14e
	  /* The last line is incomplete and we do not silently
Packit 33f14e
	     complete lines.  If the line cannot compare equal to any
Packit 33f14e
	     complete line, put it into buckets[-1] so that it can
Packit 33f14e
	     compare equal only to the other file's incomplete line
Packit 33f14e
	     (if one exists).  */
Packit 33f14e
	  if (ig_white_space < IGNORE_TRAILING_SPACE)
Packit 33f14e
	    bucket = &buckets[-1];
Packit 33f14e
	}
Packit 33f14e
Packit 33f14e
      for (i = *bucket;  ;  i = eqs[i].next)
Packit 33f14e
	if (!i)
Packit 33f14e
	  {
Packit 33f14e
	    /* Create a new equivalence class in this bucket.  */
Packit 33f14e
	    i = eqs_index++;
Packit 33f14e
	    if (i == eqs_alloc)
Packit 33f14e
	      {
Packit 33f14e
		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
Packit 33f14e
		  xalloc_die ();
Packit 33f14e
		eqs_alloc *= 2;
Packit 33f14e
		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
Packit 33f14e
	      }
Packit 33f14e
	    eqs[i].next = *bucket;
Packit 33f14e
	    eqs[i].hash = h;
Packit 33f14e
	    eqs[i].line = ip;
Packit 33f14e
	    eqs[i].length = length;
Packit 33f14e
	    *bucket = i;
Packit 33f14e
	    break;
Packit 33f14e
	  }
Packit 33f14e
	else if (eqs[i].hash == h)
Packit 33f14e
	  {
Packit 33f14e
	    char const *eqline = eqs[i].line;
Packit 33f14e
Packit 33f14e
	    /* Reuse existing class if lines_differ reports the lines
Packit 33f14e
               equal.  */
Packit 33f14e
	    if (eqs[i].length == length)
Packit 33f14e
	      {
Packit 33f14e
		/* Reuse existing equivalence class if the lines are identical.
Packit 33f14e
		   This detects the common case of exact identity
Packit 33f14e
		   faster than lines_differ would.  */
Packit 33f14e
		if (memcmp (eqline, ip, length) == 0)
Packit 33f14e
		  break;
Packit 33f14e
		if (!same_length_diff_contents_compare_anyway)
Packit 33f14e
		  continue;
Packit 33f14e
	      }
Packit 33f14e
	    else if (!diff_length_compare_anyway)
Packit 33f14e
	      continue;
Packit 33f14e
Packit Service 3a92f2
	    if (! lines_differ (eqline, ip))
Packit 33f14e
	      break;
Packit 33f14e
	  }
Packit 33f14e
Packit 33f14e
      /* Maybe increase the size of the line table.  */
Packit 33f14e
      if (line == alloc_lines)
Packit 33f14e
	{
Packit 33f14e
	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
Packit 33f14e
	  if (PTRDIFF_MAX / 3 <= alloc_lines
Packit 33f14e
	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
Packit 33f14e
	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
Packit 33f14e
	    xalloc_die ();
Packit 33f14e
	  alloc_lines = 2 * alloc_lines - linbuf_base;
Packit 33f14e
	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
Packit 33f14e
	  linbuf += linbuf_base;
Packit 33f14e
	  linbuf = xrealloc (linbuf,
Packit 33f14e
			     (alloc_lines - linbuf_base) * sizeof *linbuf);
Packit 33f14e
	  linbuf -= linbuf_base;
Packit 33f14e
	}
Packit 33f14e
      linbuf[line] = ip;
Packit 33f14e
      cureqs[line] = i;
Packit 33f14e
      ++line;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  current->buffered_lines = line;
Packit 33f14e
Packit 33f14e
  for (i = 0;  ;  i++)
Packit 33f14e
    {
Packit 33f14e
      /* Record the line start for lines in the suffix that we care about.
Packit 33f14e
	 Record one more line start than lines,
Packit 33f14e
	 so that we can compute the length of any buffered line.  */
Packit 33f14e
      if (line == alloc_lines)
Packit 33f14e
	{
Packit 33f14e
	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
Packit 33f14e
	  if (PTRDIFF_MAX / 3 <= alloc_lines
Packit 33f14e
	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
Packit 33f14e
	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
Packit 33f14e
	    xalloc_die ();
Packit 33f14e
	  alloc_lines = 2 * alloc_lines - linbuf_base;
Packit 33f14e
	  linbuf += linbuf_base;
Packit 33f14e
	  linbuf = xrealloc (linbuf,
Packit 33f14e
			     (alloc_lines - linbuf_base) * sizeof *linbuf);
Packit 33f14e
	  linbuf -= linbuf_base;
Packit 33f14e
	}
Packit 33f14e
      linbuf[line] = p;
Packit 33f14e
Packit 33f14e
      if (p == bufend)
Packit 33f14e
	{
Packit 33f14e
	  /* If the last line is incomplete and we do not silently
Packit 33f14e
	     complete lines, don't count its appended newline.  */
Packit 33f14e
	  if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style))
Packit 33f14e
	    linbuf[line]--;
Packit 33f14e
	  break;
Packit 33f14e
	}
Packit 33f14e
Packit 33f14e
      if (context <= i && no_diff_means_no_output)
Packit 33f14e
	break;
Packit 33f14e
Packit 33f14e
      line++;
Packit 33f14e
Packit 33f14e
      while (*p++ != '\n')
Packit 33f14e
	continue;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  /* Done with cache in local variables.  */
Packit 33f14e
  current->linbuf = linbuf;
Packit 33f14e
  current->valid_lines = line;
Packit 33f14e
  current->alloc_lines = alloc_lines;
Packit 33f14e
  current->equivs = cureqs;
Packit 33f14e
  equivs = eqs;
Packit 33f14e
  equivs_alloc = eqs_alloc;
Packit 33f14e
  equivs_index = eqs_index;
Packit 33f14e
}
Packit 33f14e

Packit 33f14e
/* Prepare the text.  Make sure the text end is initialized.
Packit 33f14e
   Make sure text ends in a newline,
Packit 33f14e
   but remember that we had to add one.
Packit 33f14e
   Strip trailing CRs, if that was requested.  */
Packit 33f14e
Packit 33f14e
static void
Packit 33f14e
prepare_text (struct file_data *current)
Packit 33f14e
{
Packit 33f14e
  size_t buffered = current->buffered;
Packit 33f14e
  char *p = FILE_BUFFER (current);
Packit 33f14e
Packit 33f14e
  if (buffered == 0 || p[buffered - 1] == '\n')
Packit 33f14e
    current->missing_newline = false;
Packit 33f14e
  else
Packit 33f14e
    {
Packit 33f14e
      p[buffered++] = '\n';
Packit 33f14e
      current->missing_newline = true;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  if (!p)
Packit 33f14e
    return;
Packit 33f14e
Packit 33f14e
  /* Don't use uninitialized storage when planting or using sentinels.  */
Packit 33f14e
  memset (p + buffered, 0, sizeof (word));
Packit 33f14e
Packit 33f14e
  if (strip_trailing_cr)
Packit 33f14e
    {
Packit 33f14e
      char *dst;
Packit 33f14e
      char *srclim = p + buffered;
Packit 33f14e
      *srclim = '\r';
Packit 33f14e
      dst = rawmemchr (p, '\r');
Packit 33f14e
Packit 33f14e
      if (dst != srclim)
Packit 33f14e
	{
Packit 33f14e
	  char const *src = dst;
Packit 33f14e
	  do
Packit 33f14e
	    {
Packit 33f14e
	      *dst = *src++;
Packit 33f14e
	      dst += ! (*dst == '\r' && *src == '\n');
Packit 33f14e
	    }
Packit 33f14e
	  while (src < srclim);
Packit 33f14e
Packit 33f14e
	  buffered -= src - dst;
Packit 33f14e
	}
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  current->buffered = buffered;
Packit 33f14e
}
Packit 33f14e

Packit 33f14e
/* We have found N lines in a buffer of size S; guess the
Packit 33f14e
   proportionate number of lines that will be found in a buffer of
Packit 33f14e
   size T.  However, do not guess a number of lines so large that the
Packit 33f14e
   resulting line table might cause overflow in size calculations.  */
Packit 33f14e
static lin
Packit 33f14e
guess_lines (lin n, size_t s, size_t t)
Packit 33f14e
{
Packit 33f14e
  size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
Packit 33f14e
  lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
Packit 33f14e
  return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
/* Given a vector of two file_data objects, find the identical
Packit 33f14e
   prefixes and suffixes of each object.  */
Packit 33f14e
Packit 33f14e
static void
Packit 33f14e
find_identical_ends (struct file_data filevec[])
Packit 33f14e
{
Packit 33f14e
  word *w0, *w1;
Packit 33f14e
  char *p0, *p1, *buffer0, *buffer1;
Packit 33f14e
  char const *end0, *beg0;
Packit 33f14e
  char const **linbuf0, **linbuf1;
Packit 33f14e
  lin i, lines;
Packit 33f14e
  size_t n0, n1;
Packit 33f14e
  lin alloc_lines0, alloc_lines1;
Packit 33f14e
  bool prefix_needed;
Packit 33f14e
  lin buffered_prefix, prefix_count, prefix_mask;
Packit 33f14e
  lin middle_guess, suffix_guess;
Packit 33f14e
Packit 33f14e
  slurp (&filevec[0]);
Packit 33f14e
  prepare_text (&filevec[0]);
Packit 33f14e
  if (filevec[0].desc != filevec[1].desc)
Packit 33f14e
    {
Packit 33f14e
      slurp (&filevec[1]);
Packit 33f14e
      prepare_text (&filevec[1]);
Packit 33f14e
    }
Packit 33f14e
  else
Packit 33f14e
    {
Packit 33f14e
      filevec[1].buffer = filevec[0].buffer;
Packit 33f14e
      filevec[1].bufsize = filevec[0].bufsize;
Packit 33f14e
      filevec[1].buffered = filevec[0].buffered;
Packit 33f14e
      filevec[1].missing_newline = filevec[0].missing_newline;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  /* Find identical prefix.  */
Packit 33f14e
Packit 33f14e
  w0 = filevec[0].buffer;
Packit 33f14e
  w1 = filevec[1].buffer;
Packit 33f14e
  p0 = buffer0 = (char *) w0;
Packit 33f14e
  p1 = buffer1 = (char *) w1;
Packit 33f14e
  n0 = filevec[0].buffered;
Packit 33f14e
  n1 = filevec[1].buffered;
Packit 33f14e
Packit 33f14e
  if (p0 == p1)
Packit 33f14e
    /* The buffers are the same; sentinels won't work.  */
Packit 33f14e
    p0 = p1 += n1;
Packit 33f14e
  else
Packit 33f14e
    {
Packit 33f14e
      /* Insert end sentinels, in this case characters that are guaranteed
Packit 33f14e
	 to make the equality test false, and thus terminate the loop.  */
Packit 33f14e
Packit 33f14e
      if (n0 < n1)
Packit 33f14e
	p0[n0] = ~p1[n0];
Packit 33f14e
      else
Packit 33f14e
	p1[n1] = ~p0[n1];
Packit 33f14e
Packit 33f14e
      /* Loop until first mismatch, or to the sentinel characters.  */
Packit 33f14e
Packit 33f14e
      /* Compare a word at a time for speed.  */
Packit 33f14e
      while (*w0 == *w1)
Packit 33f14e
	w0++, w1++;
Packit 33f14e
Packit 33f14e
      /* Do the last few bytes of comparison a byte at a time.  */
Packit 33f14e
      p0 = (char *) w0;
Packit 33f14e
      p1 = (char *) w1;
Packit 33f14e
      while (*p0 == *p1)
Packit 33f14e
	p0++, p1++;
Packit 33f14e
Packit 33f14e
      /* Don't mistakenly count missing newline as part of prefix.  */
Packit 33f14e
      if (ROBUST_OUTPUT_STYLE (output_style)
Packit 33f14e
	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
Packit 33f14e
	      !=
Packit 33f14e
	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
Packit 33f14e
	p0--, p1--;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  /* Now P0 and P1 point at the first nonmatching characters.  */
Packit 33f14e
Packit 33f14e
  /* Skip back to last line-beginning in the prefix,
Packit 33f14e
     and then discard up to HORIZON_LINES lines from the prefix.  */
Packit 33f14e
  i = horizon_lines;
Packit 33f14e
  while (p0 != buffer0 && (p0[-1] != '\n' || i--))
Packit 33f14e
    p0--, p1--;
Packit 33f14e
Packit 33f14e
  /* Record the prefix.  */
Packit 33f14e
  filevec[0].prefix_end = p0;
Packit 33f14e
  filevec[1].prefix_end = p1;
Packit 33f14e
Packit 33f14e
  /* Find identical suffix.  */
Packit 33f14e
Packit 33f14e
  /* P0 and P1 point beyond the last chars not yet compared.  */
Packit 33f14e
  p0 = buffer0 + n0;
Packit 33f14e
  p1 = buffer1 + n1;
Packit 33f14e
Packit 33f14e
  if (! ROBUST_OUTPUT_STYLE (output_style)
Packit 33f14e
      || filevec[0].missing_newline == filevec[1].missing_newline)
Packit 33f14e
    {
Packit 33f14e
      end0 = p0;	/* Addr of last char in file 0.  */
Packit 33f14e
Packit 33f14e
      /* Get value of P0 at which we should stop scanning backward:
Packit 33f14e
	 this is when either P0 or P1 points just past the last char
Packit 33f14e
	 of the identical prefix.  */
Packit 33f14e
      beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
Packit 33f14e
Packit 33f14e
      /* Scan back until chars don't match or we reach that point.  */
Packit 33f14e
      while (p0 != beg0)
Packit 33f14e
	if (*--p0 != *--p1)
Packit 33f14e
	  {
Packit 33f14e
	    /* Point at the first char of the matching suffix.  */
Packit 33f14e
	    ++p0, ++p1;
Packit 33f14e
	    beg0 = p0;
Packit 33f14e
	    break;
Packit 33f14e
	  }
Packit 33f14e
Packit 33f14e
      /* Are we at a line-beginning in both files?  If not, add the rest of
Packit 33f14e
	 this line to the main body.  Discard up to HORIZON_LINES lines from
Packit 33f14e
	 the identical suffix.  Also, discard one extra line,
Packit 33f14e
	 because shift_boundaries may need it.  */
Packit 33f14e
      i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
Packit 33f14e
			    &&
Packit 33f14e
			    (buffer1 == p1 || p1[-1] == '\n'));
Packit 33f14e
      while (i-- && p0 != end0)
Packit 33f14e
	while (*p0++ != '\n')
Packit 33f14e
	  continue;
Packit 33f14e
Packit 33f14e
      p1 += p0 - beg0;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  /* Record the suffix.  */
Packit 33f14e
  filevec[0].suffix_begin = p0;
Packit 33f14e
  filevec[1].suffix_begin = p1;
Packit 33f14e
Packit 33f14e
  /* Calculate number of lines of prefix to save.
Packit 33f14e
Packit 33f14e
     prefix_count == 0 means save the whole prefix;
Packit 33f14e
     we need this for options like -D that output the whole file,
Packit 33f14e
     or for enormous contexts (to avoid worrying about arithmetic overflow).
Packit 33f14e
     We also need it for options like -F that output some preceding line;
Packit 33f14e
     at least we will need to find the last few lines,
Packit 33f14e
     but since we don't know how many, it's easiest to find them all.
Packit 33f14e
Packit 33f14e
     Otherwise, prefix_count != 0.  Save just prefix_count lines at start
Packit 33f14e
     of the line buffer; they'll be moved to the proper location later.
Packit 33f14e
     Handle 1 more line than the context says (because we count 1 too many),
Packit 33f14e
     rounded up to the next power of 2 to speed index computation.  */
Packit 33f14e
Packit 33f14e
  if (no_diff_means_no_output && ! function_regexp.fastmap
Packit 33f14e
      && context < LIN_MAX / 4 && context < n0)
Packit 33f14e
    {
Packit 33f14e
      middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
Packit 33f14e
      suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
Packit 33f14e
      for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
Packit 33f14e
	continue;
Packit 33f14e
      alloc_lines0 = (prefix_count + middle_guess
Packit 33f14e
		      + MIN (context, suffix_guess));
Packit 33f14e
    }
Packit 33f14e
  else
Packit 33f14e
    {
Packit 33f14e
      prefix_count = 0;
Packit 33f14e
      alloc_lines0 = guess_lines (0, 0, n0);
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  prefix_mask = prefix_count - 1;
Packit 33f14e
  lines = 0;
Packit 33f14e
  linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
Packit 33f14e
  prefix_needed = ! (no_diff_means_no_output
Packit 33f14e
		     && filevec[0].prefix_end == p0
Packit 33f14e
		     && filevec[1].prefix_end == p1);
Packit 33f14e
  p0 = buffer0;
Packit 33f14e
Packit 33f14e
  /* If the prefix is needed, find the prefix lines.  */
Packit 33f14e
  if (prefix_needed)
Packit 33f14e
    {
Packit 33f14e
      end0 = filevec[0].prefix_end;
Packit 33f14e
      while (p0 != end0)
Packit 33f14e
	{
Packit 33f14e
	  lin l = lines++ & prefix_mask;
Packit 33f14e
	  if (l == alloc_lines0)
Packit 33f14e
	    {
Packit 33f14e
	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
Packit 33f14e
		xalloc_die ();
Packit 33f14e
	      alloc_lines0 *= 2;
Packit 33f14e
	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
Packit 33f14e
	    }
Packit 33f14e
	  linbuf0[l] = p0;
Packit 33f14e
	  while (*p0++ != '\n')
Packit 33f14e
	    continue;
Packit 33f14e
	}
Packit 33f14e
    }
Packit 33f14e
  buffered_prefix = prefix_count && context < lines ? context : lines;
Packit 33f14e
Packit 33f14e
  /* Allocate line buffer 1.  */
Packit 33f14e
Packit 33f14e
  middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
Packit 33f14e
  suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
Packit 33f14e
  alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
Packit 33f14e
  if (alloc_lines1 < buffered_prefix
Packit 33f14e
      || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
Packit 33f14e
    xalloc_die ();
Packit 33f14e
  linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
Packit 33f14e
Packit 33f14e
  if (buffered_prefix != lines)
Packit 33f14e
    {
Packit 33f14e
      /* Rotate prefix lines to proper location.  */
Packit 33f14e
      for (i = 0;  i < buffered_prefix;  i++)
Packit 33f14e
	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
Packit 33f14e
      for (i = 0;  i < buffered_prefix;  i++)
Packit 33f14e
	linbuf0[i] = linbuf1[i];
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  /* Initialize line buffer 1 from line buffer 0.  */
Packit 33f14e
  for (i = 0; i < buffered_prefix; i++)
Packit 33f14e
    linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
Packit 33f14e
Packit 33f14e
  /* Record the line buffer, adjusted so that
Packit 33f14e
     linbuf[0] points at the first differing line.  */
Packit 33f14e
  filevec[0].linbuf = linbuf0 + buffered_prefix;
Packit 33f14e
  filevec[1].linbuf = linbuf1 + buffered_prefix;
Packit 33f14e
  filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
Packit 33f14e
  filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
Packit 33f14e
  filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
Packit 33f14e
  filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
Packit 33f14e
}
Packit 33f14e

Packit 33f14e
/* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
Packit 33f14e
   than 2**k.  This table is derived from Chris K. Caldwell's list
Packit 33f14e
   <http://www.utm.edu/research/primes/lists/2small/>.  */
Packit 33f14e
Packit 33f14e
static unsigned char const prime_offset[] =
Packit 33f14e
{
Packit 33f14e
  0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
Packit 33f14e
  15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
Packit 33f14e
  11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
Packit 33f14e
  55, 93, 1, 57, 25
Packit 33f14e
};
Packit 33f14e
Packit 33f14e
/* Verify that this host's size_t is not too wide for the above table.  */
Packit 33f14e
Packit 33f14e
verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
Packit 33f14e
Packit 33f14e
/* Given a vector of two file_data objects, read the file associated
Packit 33f14e
   with each one, and build the table of equivalence classes.
Packit 33f14e
   Return nonzero if either file appears to be a binary file.
Packit 33f14e
   If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
Packit 33f14e
Packit 33f14e
bool
Packit 33f14e
read_files (struct file_data filevec[], bool pretend_binary)
Packit 33f14e
{
Packit 33f14e
  int i;
Packit 33f14e
  bool skip_test = text | pretend_binary;
Packit 33f14e
  bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
Packit 33f14e
Packit 33f14e
  if (filevec[0].desc != filevec[1].desc)
Packit 33f14e
    appears_binary |= sip (&filevec[1], skip_test | appears_binary);
Packit 33f14e
  else
Packit 33f14e
    {
Packit 33f14e
      filevec[1].buffer = filevec[0].buffer;
Packit 33f14e
      filevec[1].bufsize = filevec[0].bufsize;
Packit 33f14e
      filevec[1].buffered = filevec[0].buffered;
Packit 33f14e
    }
Packit 33f14e
  if (appears_binary)
Packit 33f14e
    {
Packit 33f14e
      set_binary_mode (filevec[0].desc, O_BINARY);
Packit 33f14e
      set_binary_mode (filevec[1].desc, O_BINARY);
Packit 33f14e
      return true;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  find_identical_ends (filevec);
Packit 33f14e
Packit 33f14e
  equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
Packit 33f14e
  if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
Packit 33f14e
    xalloc_die ();
Packit 33f14e
  equivs = xmalloc (equivs_alloc * sizeof *equivs);
Packit 33f14e
  /* Equivalence class 0 is permanently safe for lines that were not
Packit 33f14e
     hashed.  Real equivalence classes start at 1.  */
Packit 33f14e
  equivs_index = 1;
Packit 33f14e
Packit 33f14e
  /* Allocate (one plus) a prime number of hash buckets.  Use a prime
Packit 33f14e
     number between 1/3 and 2/3 of the value of equiv_allocs,
Packit 33f14e
     approximately.  */
Packit 33f14e
  for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
Packit 33f14e
    continue;
Packit 33f14e
  nbuckets = ((size_t) 1 << i) - prime_offset[i];
Packit 33f14e
  if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
Packit 33f14e
    xalloc_die ();
Packit 33f14e
  buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
Packit 33f14e
  buckets++;
Packit 33f14e
Packit 33f14e
  for (i = 0; i < 2; i++)
Packit 33f14e
    find_and_hash_each_line (&filevec[i]);
Packit 33f14e
Packit 33f14e
  filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
Packit 33f14e
Packit 33f14e
  free (equivs);
Packit 33f14e
  free (buckets - 1);
Packit 33f14e
Packit 33f14e
  return false;
Packit 33f14e
}