Blame libarchive/archive_read_support_format_warc.c

Packit Service 1d0348
/*-
 * Copyright (c) 2014 Sebastian Freundt
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
Packit Service 1d0348
Packit Service 1d0348
#include "archive_platform.h"
Packit Service 1d0348
__FBSDID("$FreeBSD$");
Packit Service 1d0348
Packit Service 1d0348
/**
 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
 * ISO 28500:2009.
 * For the purposes of this file we used the final draft from:
 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
 *
 * Todo:
 * [ ] Real-world WARCs can contain resources at endpoints ending in /,
 *     e.g. http://bibnum.bnf.fr/warc/
 *     If you're lucky, their response contains a Content-Location: header
 *     pointing to a unix-compliant filename; in the example above it's
 *     Content-Location: http://bibnum.bnf.fr/warc/index.html
 *     However, that's not mandated, and github for example doesn't follow
 *     this convention.
 *     We need a set of archive options to control what to do with
 *     entries like these; at the moment care is taken to skip them.
 *
 **/
Packit Service 1d0348
Packit Service 1d0348
#ifdef HAVE_SYS_STAT_H
Packit Service 1d0348
#include <sys/stat.h>
Packit Service 1d0348
#endif
Packit Service 1d0348
#ifdef HAVE_ERRNO_H
Packit Service 1d0348
#include <errno.h>
Packit Service 1d0348
#endif
Packit Service 1d0348
#ifdef HAVE_STDLIB_H
Packit Service 1d0348
#include <stdlib.h>
Packit Service 1d0348
#endif
Packit Service 1d0348
#ifdef HAVE_STRING_H
Packit Service 1d0348
#include <string.h>
Packit Service 1d0348
#endif
Packit Service 1d0348
#ifdef HAVE_LIMITS_H
Packit Service 1d0348
#include <limits.h>
Packit Service 1d0348
#endif
Packit Service 1d0348
#ifdef HAVE_CTYPE_H
Packit Service 1d0348
#include <ctype.h>
Packit Service 1d0348
#endif
Packit Service 1d0348
#ifdef HAVE_TIME_H
Packit Service 1d0348
#include <time.h>
Packit Service 1d0348
#endif
Packit Service 1d0348
Packit Service 1d0348
#include "archive.h"
Packit Service 1d0348
#include "archive_entry.h"
Packit Service 1d0348
#include "archive_private.h"
Packit Service 1d0348
#include "archive_read_private.h"
Packit Service 1d0348
Packit Service 1d0348
/**
 * Record types that a WARC header may announce.
 * The enumerators are consecutive, starting at WT_NONE == 0, and LAST_WT
 * serves as an out-of-range marker.
 */
typedef enum {
	/* no (recognised) type */
	WT_NONE,
	/* warcinfo */
	WT_INFO,
	/* metadata */
	WT_META,
	/* resource */
	WT_RSRC,
	/* request, unsupported */
	WT_REQ,
	/* response, treated like a resource by the header reader */
	WT_RSP,
	/* revisit, unsupported */
	WT_RVIS,
	/* conversion, unsupported */
	WT_CONV,
	/* continuation, unsupported at the moment */
	WT_CONT,
	/* invalid type */
	LAST_WT
} warc_type_t;
Packit Service 1d0348
Packit Service 1d0348
/**
 * Read-only, length-delimited string view.
 * STR need not be NUL-terminated; LEN is the number of valid bytes. */
typedef struct {
	size_t len;
	const char *str;
} warc_string_t;
Packit Service 1d0348
Packit Service 1d0348
/**
 * Writable counterpart of warc_string_t: a character buffer STR together
 * with its size LEN in bytes. */
typedef struct {
	size_t len;
	char *str;
} warc_strbuf_t;
Packit Service 1d0348
Packit Service 1d0348
struct warc_s {
Packit Service 1d0348
	/* content length ahead */
Packit Service 1d0348
	size_t cntlen;
Packit Service 1d0348
	/* and how much we've processed so far */
Packit Service 1d0348
	size_t cntoff;
Packit Service 1d0348
	/* and how much we need to consume between calls */
Packit Service 1d0348
	size_t unconsumed;
Packit Service 1d0348
Packit Service 1d0348
	/* string pool */
Packit Service 1d0348
	warc_strbuf_t pool;
Packit Service 1d0348
	/* previous version */
Packit Service 1d0348
	unsigned int pver;
Packit Service 1d0348
	/* stringified format name */
Packit Service 1d0348
	struct archive_string sver;
Packit Service 1d0348
};
Packit Service 1d0348
Packit Service 1d0348
static int _warc_bid(struct archive_read *a, int);
Packit Service 1d0348
static int _warc_cleanup(struct archive_read *a);
Packit Service 1d0348
static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
Packit Service 1d0348
static int _warc_skip(struct archive_read *a);
Packit Service 1d0348
static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
Packit Service 1d0348
Packit Service 1d0348
/* private routines */
Packit Service 1d0348
static unsigned int _warc_rdver(const char buf[10], size_t bsz);
Packit Service 1d0348
static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
Packit Service 1d0348
static warc_string_t _warc_rduri(const char *buf, size_t bsz);
Packit Service 1d0348
static ssize_t _warc_rdlen(const char *buf, size_t bsz);
Packit Service 1d0348
static time_t _warc_rdrtm(const char *buf, size_t bsz);
Packit Service 1d0348
static time_t _warc_rdmtm(const char *buf, size_t bsz);
Packit Service 1d0348
static const char *_warc_find_eoh(const char *buf, size_t bsz);
Packit Service 1d0348
static const char *_warc_find_eol(const char *buf, size_t bsz);
Packit Service 1d0348
Packit Service 1d0348
int
Packit Service 1d0348
archive_read_support_format_warc(struct archive *_a)
Packit Service 1d0348
{
Packit Service 1d0348
	struct archive_read *a = (struct archive_read *)_a;
Packit Service 1d0348
	struct warc_s *w;
Packit Service 1d0348
	int r;
Packit Service 1d0348
Packit Service 1d0348
	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
Packit Service 1d0348
	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
Packit Service 1d0348
Packit Service 1d0348
	if ((w = calloc(1, sizeof(*w))) == NULL) {
Packit Service 1d0348
		archive_set_error(&a->archive, ENOMEM,
Packit Service 1d0348
		    "Can't allocate warc data");
Packit Service 1d0348
		return (ARCHIVE_FATAL);
Packit Service 1d0348
	}
Packit Service 1d0348
Packit Service 1d0348
	r = __archive_read_register_format(
Packit Service 1d0348
		a, w, "warc",
Packit Service 1d0348
		_warc_bid, NULL, _warc_rdhdr, _warc_read,
Packit Service 1d0348
		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
Packit Service 1d0348
Packit Service 1d0348
	if (r != ARCHIVE_OK) {
Packit Service 1d0348
		free(w);
Packit Service 1d0348
		return (r);
Packit Service 1d0348
	}
Packit Service 1d0348
	return (ARCHIVE_OK);
Packit Service 1d0348
}
Packit Service 1d0348
Packit Service 1d0348
static int
Packit Service 1d0348
_warc_cleanup(struct archive_read *a)
Packit Service 1d0348
{
Packit Service 1d0348
	struct warc_s *w = a->format->data;
Packit Service 1d0348
Packit Service 1d0348
	if (w->pool.len > 0U) {
Packit Service 1d0348
		free(w->pool.str);
Packit Service 1d0348
	}
Packit Service 1d0348
	archive_string_free(&w->sver);
Packit Service 1d0348
	free(w);
Packit Service 1d0348
	a->format->data = NULL;
Packit Service 1d0348
	return (ARCHIVE_OK);
Packit Service 1d0348
}
Packit Service 1d0348
Packit Service 1d0348
/**
 * Bid on the input of A.
 *
 * Peeks at the first 12 bytes and decodes the version from the
 * "WARC/x.y" magic.  Returns 64 when the version lies between 0.12 and
 * 1.0 inclusive, and -1 when the probe cannot be read, is too short or
 * carries a missing, malformed or unsupported version.
 * BEST_BID is ignored. */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *hdr;
	ssize_t nrd;
	unsigned int ver;

	(void)best_bid; /* UNUSED */

	/* check first line of file, it should be a record already */
	hdr = __archive_read_ahead(a, 12U, &nrd);
	if (hdr == NULL) {
		/* no idea what to do */
		return -1;
	}
	if (nrd < 12) {
		/* nah, not for us, our magic cookie is at least 12 bytes */
		return -1;
	}

	/* otherwise snarf the record's version number,
	 * 0U (invalid) is caught by the lower bound as well */
	ver = _warc_rdver(hdr, nrd);
	if (ver < 1200U || ver > 10000U) {
		/* we only support WARC 0.12 to 1.0 */
		return -1;
	}

	/* otherwise be confident */
	return (64);
}
Packit Service 1d0348
Packit Service 1d0348
static int
Packit Service 1d0348
_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
Packit Service 1d0348
{
Packit Service 1d0348
#define HDR_PROBE_LEN		(12U)
Packit Service 1d0348
	struct warc_s *w = a->format->data;
Packit Service 1d0348
	unsigned int ver;
Packit Service 1d0348
	const char *buf;
Packit Service 1d0348
	ssize_t nrd;
Packit Service 1d0348
	const char *eoh;
Packit Service 1d0348
	/* for the file name, saves some strndup()'ing */
Packit Service 1d0348
	warc_string_t fnam;
Packit Service 1d0348
	/* warc record type, not that we really use it a lot */
Packit Service 1d0348
	warc_type_t ftyp;
Packit Service 1d0348
	/* content-length+error monad */
Packit Service 1d0348
	ssize_t cntlen;
Packit Service 1d0348
	/* record time is the WARC-Date time we reinterpret it as ctime */
Packit Service 1d0348
	time_t rtime;
Packit Service 1d0348
	/* mtime is the Last-Modified time which will be the entry's mtime */
Packit Service 1d0348
	time_t mtime;
Packit Service 1d0348
Packit Service 1d0348
start_over:
Packit Service 1d0348
	/* just use read_ahead() they keep track of unconsumed
Packit Service 1d0348
	 * bits and bobs for us; no need to put an extra shift in
Packit Service 1d0348
	 * and reproduce that functionality here */
Packit Service 1d0348
	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd;;
Packit Service 1d0348
Packit Service 1d0348
	if (nrd < 0) {
Packit Service 1d0348
		/* no good */
Packit Service 1d0348
		archive_set_error(
Packit Service 1d0348
			&a->archive, ARCHIVE_ERRNO_MISC,
Packit Service 1d0348
			"Bad record header");
Packit Service 1d0348
		return (ARCHIVE_FATAL);
Packit Service 1d0348
	} else if (buf == NULL) {
Packit Service 1d0348
		/* there should be room for at least WARC/bla\r\n
Packit Service 1d0348
		 * must be EOF therefore */
Packit Service 1d0348
		return (ARCHIVE_EOF);
Packit Service 1d0348
	}
Packit Service 1d0348
 	/* looks good so far, try and find the end of the header now */
Packit Service 1d0348
	eoh = _warc_find_eoh(buf, nrd);
Packit Service 1d0348
	if (eoh == NULL) {
Packit Service 1d0348
		/* still no good, the header end might be beyond the
Packit Service 1d0348
		 * probe we've requested, but then again who'd cram
Packit Service 1d0348
		 * so much stuff into the header *and* be 28500-compliant */
Packit Service 1d0348
		archive_set_error(
Packit Service 1d0348
			&a->archive, ARCHIVE_ERRNO_MISC,
Packit Service 1d0348
			"Bad record header");
Packit Service 1d0348
		return (ARCHIVE_FATAL);
Packit Service 1d0348
	}
Packit Service 1d0348
	ver = _warc_rdver(buf, eoh - buf);
Packit Service 1d0348
	/* we currently support WARC 0.12 to 1.0 */
Packit Service 1d0348
	if (ver == 0U) {
Packit Service 1d0348
		archive_set_error(
Packit Service 1d0348
			&a->archive, ARCHIVE_ERRNO_MISC,
Packit Service 1d0348
			"Invalid record version");
Packit Service 1d0348
		return (ARCHIVE_FATAL);
Packit Service 1d0348
	} else if (ver < 1200U || ver > 10000U) {
Packit Service 1d0348
		archive_set_error(
Packit Service 1d0348
			&a->archive, ARCHIVE_ERRNO_MISC,
Packit Service 1d0348
			"Unsupported record version: %u.%u",
Packit Service 1d0348
			ver / 10000, (ver % 10000) / 100);
Packit Service 1d0348
		return (ARCHIVE_FATAL);
Packit Service 1d0348
	}
Packit Service 1d0348
	cntlen = _warc_rdlen(buf, eoh - buf);
Packit Service 1d0348
	if (cntlen < 0) {
Packit Service 1d0348
		/* nightmare!  the specs say content-length is mandatory
Packit Service 1d0348
		 * so I don't feel overly bad stopping the reader here */
Packit Service 1d0348
		archive_set_error(
Packit Service 1d0348
			&a->archive, EINVAL,
Packit Service 1d0348
			"Bad content length");
Packit Service 1d0348
		return (ARCHIVE_FATAL);
Packit Service 1d0348
	}
Packit Service 1d0348
	rtime = _warc_rdrtm(buf, eoh - buf);
Packit Service 1d0348
	if (rtime == (time_t)-1) {
Packit Service 1d0348
		/* record time is mandatory as per WARC/1.0,
Packit Service 1d0348
		 * so just barf here, fast and loud */
Packit Service 1d0348
		archive_set_error(
Packit Service 1d0348
			&a->archive, EINVAL,
Packit Service 1d0348
			"Bad record time");
Packit Service 1d0348
		return (ARCHIVE_FATAL);
Packit Service 1d0348
	}
Packit Service 1d0348
Packit Service 1d0348
	/* let the world know we're a WARC archive */
Packit Service 1d0348
	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
Packit Service 1d0348
	if (ver != w->pver) {
Packit Service 1d0348
		/* stringify this entry's version */
Packit Service 1d0348
		archive_string_sprintf(&w->sver,
Packit Service 1d0348
			"WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
Packit Service 1d0348
		/* remember the version */
Packit Service 1d0348
		w->pver = ver;
Packit Service 1d0348
	}
Packit Service 1d0348
	/* start off with the type */
Packit Service 1d0348
	ftyp = _warc_rdtyp(buf, eoh - buf);
Packit Service 1d0348
	/* and let future calls know about the content */
Packit Service 1d0348
	w->cntlen = cntlen;
Packit Service 1d0348
	w->cntoff = 0U;
Packit Service 1d0348
	mtime = 0;/* Avoid compiling error on some platform. */
Packit Service 1d0348
Packit Service 1d0348
	switch (ftyp) {
Packit Service 1d0348
	case WT_RSRC:
Packit Service 1d0348
	case WT_RSP:
Packit Service 1d0348
		/* only try and read the filename in the cases that are
Packit Service 1d0348
		 * guaranteed to have one */
Packit Service 1d0348
		fnam = _warc_rduri(buf, eoh - buf);
Packit Service 1d0348
		/* check the last character in the URI to avoid creating
Packit Service 1d0348
		 * directory endpoints as files, see Todo above */
Packit Service 1d0348
		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
Packit Service 1d0348
			/* break here for now */
Packit Service 1d0348
			fnam.len = 0U;
Packit Service 1d0348
			fnam.str = NULL;
Packit Service 1d0348
			break;
Packit Service 1d0348
		}
Packit Service 1d0348
		/* bang to our string pool, so we save a
Packit Service 1d0348
		 * malloc()+free() roundtrip */
Packit Service 1d0348
		if (fnam.len + 1U > w->pool.len) {
Packit Service 1d0348
			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
Packit Service 1d0348
			w->pool.str = realloc(w->pool.str, w->pool.len);
Packit Service 1d0348
		}
Packit Service 1d0348
		memcpy(w->pool.str, fnam.str, fnam.len);
Packit Service 1d0348
		w->pool.str[fnam.len] = '\0';
Packit Service 1d0348
		/* let no one else know about the pool, it's a secret, shhh */
Packit Service 1d0348
		fnam.str = w->pool.str;
Packit Service 1d0348
Packit Service 1d0348
		/* snarf mtime or deduce from rtime
Packit Service 1d0348
		 * this is a custom header added by our writer, it's quite
Packit Service 1d0348
		 * hard to believe anyone else would go through with it
Packit Service 1d0348
		 * (apart from being part of some http responses of course) */
Packit Service 1d0348
		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
Packit Service 1d0348
			mtime = rtime;
Packit Service 1d0348
		}
Packit Service 1d0348
		break;
Packit Service 1d0348
	default:
Packit Service 1d0348
		fnam.len = 0U;
Packit Service 1d0348
		fnam.str = NULL;
Packit Service 1d0348
		break;
Packit Service 1d0348
	}
Packit Service 1d0348
Packit Service 1d0348
	/* now eat some of those delicious buffer bits */
Packit Service 1d0348
	__archive_read_consume(a, eoh - buf);
Packit Service 1d0348
Packit Service 1d0348
	switch (ftyp) {
Packit Service 1d0348
	case WT_RSRC:
Packit Service 1d0348
	case WT_RSP:
Packit Service 1d0348
		if (fnam.len > 0U) {
Packit Service 1d0348
			/* populate entry object */
Packit Service 1d0348
			archive_entry_set_filetype(entry, AE_IFREG);
Packit Service 1d0348
			archive_entry_copy_pathname(entry, fnam.str);
Packit Service 1d0348
			archive_entry_set_size(entry, cntlen);
Packit Service 1d0348
			archive_entry_set_perm(entry, 0644);
Packit Service 1d0348
			/* rtime is the new ctime, mtime stays mtime */
Packit Service 1d0348
			archive_entry_set_ctime(entry, rtime, 0L);
Packit Service 1d0348
			archive_entry_set_mtime(entry, mtime, 0L);
Packit Service 1d0348
			break;
Packit Service 1d0348
		}
Packit Service 1d0348
		/* FALLTHROUGH */
Packit Service 1d0348
	default:
Packit Service 1d0348
		/* consume the content and start over */
Packit Service 1d0348
		_warc_skip(a);
Packit Service 1d0348
		goto start_over;
Packit Service 1d0348
	}
Packit Service 1d0348
	return (ARCHIVE_OK);
Packit Service 1d0348
}
Packit Service 1d0348
Packit Service 1d0348
static int
Packit Service 1d0348
_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
Packit Service 1d0348
{
Packit Service 1d0348
	struct warc_s *w = a->format->data;
Packit Service 1d0348
	const char *rab;
Packit Service 1d0348
	ssize_t nrd;
Packit Service 1d0348
Packit Service 1d0348
	if (w->cntoff >= w->cntlen) {
Packit Service 1d0348
	eof:
Packit Service 1d0348
		/* it's our lucky day, no work, we can leave early */
Packit Service 1d0348
		*buf = NULL;
Packit Service 1d0348
		*bsz = 0U;
Packit Service 1d0348
		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
Packit Service 1d0348
		w->unconsumed = 0U;
Packit Service 1d0348
		return (ARCHIVE_EOF);
Packit Service 1d0348
	}
Packit Service 1d0348
Packit Service 1d0348
	rab = __archive_read_ahead(a, 1U, &nrd;;
Packit Service 1d0348
	if (nrd < 0) {
Packit Service 1d0348
		*bsz = 0U;
Packit Service 1d0348
		/* big catastrophe */
Packit Service 1d0348
		return (int)nrd;
Packit Service 1d0348
	} else if (nrd == 0) {
Packit Service 1d0348
		goto eof;
Packit Service 1d0348
	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
Packit Service 1d0348
		/* clamp to content-length */
Packit Service 1d0348
		nrd = w->cntlen - w->cntoff;
Packit Service 1d0348
	}
Packit Service 1d0348
	*off = w->cntoff;
Packit Service 1d0348
	*bsz = nrd;
Packit Service 1d0348
	*buf = rab;
Packit Service 1d0348
Packit Service 1d0348
	w->cntoff += nrd;
Packit Service 1d0348
	w->unconsumed = (size_t)nrd;
Packit Service 1d0348
	return (ARCHIVE_OK);
Packit Service 1d0348
}
Packit Service 1d0348
Packit Service 1d0348
static int
Packit Service 1d0348
_warc_skip(struct archive_read *a)
Packit Service 1d0348
{
Packit Service 1d0348
	struct warc_s *w = a->format->data;
Packit Service 1d0348
Packit Service 1d0348
	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
Packit Service 1d0348
	w->cntlen = 0U;
Packit Service 1d0348
	w->cntoff = 0U;
Packit Service 1d0348
	return (ARCHIVE_OK);
Packit Service 1d0348
}
Packit Service 1d0348
Packit Service 1d0348

Packit Service 1d0348
/* private routines */
Packit Service 1d0348
/**
 * Return C with its const qualifier stripped, without a qualifier-
 * dropping cast.
 *
 * The conversion goes through a union of the two pointer types; this
 * avoids doing arithmetic on pointers that do not refer to any object.
 * The caller remains responsible for not writing through the result if
 * the object pointed to is really const. */
static void*
deconst(const void *c)
{
	union {
		const void *in;
		void *out;
	} u;

	u.in = c;
	return u.out;
}
Packit Service 1d0348
Packit Service 1d0348
/**
 * Find the first occurrence of the NEEDLESIZE bytes at NEEDLE within
 * the HAYSIZE bytes at HAY, like memmem().
 *
 * Returns a pointer to the start of the match, HAY itself for an empty
 * needle, or NULL if there is no match.
 *
 * Candidates are filtered with a rolling XOR checksum over a window of
 * NEEDLESIZE bytes before they are compared byte by byte. */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const eoh = hay + haysize;
	const char *const eon = needle + needlesize;
	const char *hp;
	const char *np;
	const char *cand;
	unsigned int hsum;
	unsigned int nsum;
	unsigned int eqp;

	/* trivial checks first
	 * a 0-sized needle is defined to be found anywhere in haystack
	 * then run memchr() to find a candidate in HAYSTACK (i.e. a portion
	 * that happens to begin with *NEEDLE) */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL) {
		/* trivial */
		return NULL;
	}

	/* First characters of haystack and needle are the same now. Both are
	 * guaranteed to be at least one character long.  Now compute the
	 * checksum of needle together with the checksum of the first
	 * NEEDLESIZE characters of haystack, and note on the way whether
	 * the two are equal byte for byte. */
	hsum = *hay;
	nsum = *hay;
	eqp = 1U;
	for (hp = hay + 1U, np = needle + 1U;
	     hp < eoh && np < eon; hp++, np++) {
		hsum ^= *hp;
		nsum ^= *np;
		eqp &= *hp == *np;
	}

	/* HP now references the (NEEDLESIZE + 1)-th character. */
	if (np < eon) {
		/* haystack is smaller than needle, :O */
		return NULL;
	} else if (eqp) {
		/* found a match */
		return deconst(hay);
	}

	/* now loop through the rest of haystack,
	 * updating the sum iteratively */
	for (cand = hay; hp < eoh; hp++) {
		/* slide the window: drop its first byte, take in *HP */
		hsum ^= *cand++;
		hsum ^= *hp;

		/* Since the checksums are already known to be equal at
		 * that point, it is enough to check just NEEDLESIZE - 1
		 * characters for equality,
		 * also CAND is by design < HP, so no need for range checks */
		if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) {
			return deconst(cand);
		}
	}
	return NULL;
}
Packit Service 1d0348
Packit Service 1d0348
/**
 * Parse an unsigned decimal number at STR that must lie in [LLIM, ULIM].
 *
 * Reading stops at the first non-digit, or once the number of digits
 * or the value accumulated so far rules out further digits given ULIM.
 * *EP is set to the first character not consumed.
 *
 * Returns the number read, -1 if STR does not start with a digit, or
 * -2 if the number falls outside [LLIM, ULIM]. */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	int res = 0;
	const char *sp = str;
	/* we keep track of the number of digits via rulim:
	 * it loses one decimal digit per digit read */
	int rulim = ulim > 10 ? ulim : 10;

	while (res * 10 <= ulim && rulim != 0 && *sp >= '0' && *sp <= '9') {
		res = res * 10 + (*sp - '0');
		sp++;
		rulim /= 10;
	}

	if (sp == str) {
		/* not a single digit */
		res = -1;
	} else if (res < llim || res > ulim) {
		/* out of range */
		res = -2;
	}
	*ep = (const char*)sp;
	return res;
}
Packit Service 1d0348
Packit Service 1d0348
/**
 * Convert the broken-down time T, taken to be UTC, into a time_t.
 *
 * Uses timegm() or _mkgmtime64() where the build configuration says
 * they exist.  Otherwise mktime() is called to fill in tm_yday (it may
 * normalise other fields of T as well) and the result is computed from
 * the fields of T directly.
 * Returns (time_t)-1 if mktime() fails in the fallback path. */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#elif HAVE__MKGMTIME64
	return (_mkgmtime64(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles:
	 * whole days and years since 1970 plus the leap days in between
	 * (every 4th year, except centuries, except every 400th year). */
	return (t->tm_sec
	    + t->tm_min * 60
	    + t->tm_hour * 3600
	    + t->tm_yday * 86400
	    + (t->tm_year - 70) * 31536000
	    + ((t->tm_year - 69) / 4) * 86400
	    - ((t->tm_year - 1) / 100) * 86400
	    + ((t->tm_year + 299) / 400) * 86400);
#endif
}
Packit Service 1d0348
Packit Service 1d0348
/**
 * Like strptime() but strictly for ISO 8601 Zulu strings of the form
 * YYYY-MM-DDThh:mm:ssZ, with optional leading blanks or tabs.
 *
 * Accepted ranges are 1583..4095 for the year, 1..12 for the month,
 * 1..31 for the day, 0..23 for the hour, 0..59 for the minute and
 * 0..60 for the second.
 *
 * Returns the corresponding UTC time stamp, or (time_t)-1 if S does not
 * parse.  If ENDPTR is not NULL, *ENDPTR is set to where parsing
 * stopped. */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	struct tm tm;
	time_t res = (time_t)-1;

	/* make sure tm is clean */
	memset(&tm, 0, sizeof(tm));

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	while (*s == ' ' || *s == '\t')
		++s;

	/* read year */
	if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') {
		goto out;
	}
	/* read month */
	if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') {
		goto out;
	}
	/* read day-of-month */
	if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') {
		goto out;
	}
	/* read hour */
	if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') {
		goto out;
	}
	/* read minute */
	if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') {
		goto out;
	}
	/* read second */
	if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') {
		goto out;
	}

	/* massage TM to fulfill some of POSIX' constraints:
	 * years count from 1900 and months from 0 */
	tm.tm_year -= 1900;
	tm.tm_mon--;

	/* now convert our custom tm struct to a unix stamp using UTC */
	res = time_from_tm(&tm);

out:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return res;
}
Packit Service 1d0348
Packit Service 1d0348
/**
 * Decode the version from the "WARC/<major>.<minor>" magic at BUF.
 *
 * The major version is a single digit, the minor version one or two
 * digits.  The result is major * 10000 plus the minor digits scaled so
 * that a two-digit minor counts in hundreds, e.g. 10000 for "1.0",
 * 1200 for "0.12" and 900 for "0.9".
 *
 * Returns 0 if BSZ is below 12, if the magic or the digits are missing,
 * or if the version is not followed by the terminator expected for it
 * (CRLF from 0.12 on, a blank or tab before that). */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const char *c;
	unsigned int ver = 0U;
	unsigned int end = 0U;

	if (bsz < 12 || memcmp(buf, magic, sizeof(magic) - 1U) != 0) {
		/* buffer too small or invalid magic */
		return ver;
	}
	/* looks good so far, read the version number for a laugh */
	buf += sizeof(magic) - 1U;

	if (!isdigit((unsigned char)buf[0U]) || buf[1U] != '.' ||
	    !isdigit((unsigned char)buf[2U])) {
		/* not of the form <digit>.<digit> */
		return ver;
	}

	/* we support a maximum of 2 digits in the minor version */
	if (isdigit((unsigned char)buf[3U]))
		end = 1U;
	/* set up major version */
	ver = (buf[0U] - '0') * 10000U;
	/* set up minor version */
	if (end == 1U) {
		ver += (buf[2U] - '0') * 1000U;
		ver += (buf[3U] - '0') * 100U;
	} else
		ver += (buf[2U] - '0') * 100U;
	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	c = buf + 3U + end;
	if (ver >= 1200U) {
		if (memcmp(c, "\r\n", 2U) != 0)
			ver = 0U;
	} else {
		if (*c != ' ' && *c != '\t')
			ver = 0U;
	}
	return ver;
}
Packit Service 1d0348
Packit Service 1d0348
static unsigned int
Packit Service 1d0348
_warc_rdtyp(const char *buf, size_t bsz)
Packit Service 1d0348
{
Packit Service 1d0348
	static const char _key[] = "\r\nWARC-Type:";
Packit Service 1d0348
	const char *val, *eol;
Packit Service 1d0348
Packit Service 1d0348
	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
Packit Service 1d0348
		/* no bother */
Packit Service 1d0348
		return WT_NONE;
Packit Service 1d0348
	}
Packit Service 1d0348
	val += sizeof(_key) - 1U;
Packit Service 1d0348
	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
Packit Service 1d0348
		/* no end of line */
Packit Service 1d0348
		return WT_NONE;
Packit Service 1d0348
	}
Packit Service 1d0348
Packit Service 1d0348
	/* overread whitespace */
Packit Service 1d0348
	while (val < eol && (*val == ' ' || *val == '\t'))
Packit Service 1d0348
		++val;
Packit Service 1d0348
Packit Service 1d0348
	if (val + 8U == eol) {
Packit Service 1d0348
		if (memcmp(val, "resource", 8U) == 0)
Packit Service 1d0348
			return WT_RSRC;
Packit Service 1d0348
		else if (memcmp(val, "response", 8U) == 0)
Packit Service 1d0348
			return WT_RSP;
Packit Service 1d0348
	}
Packit Service 1d0348
	return WT_NONE;
Packit Service 1d0348
}
Packit Service 1d0348
Packit Service 1d0348
static warc_string_t
Packit Service 1d0348
_warc_rduri(const char *buf, size_t bsz)
Packit Service 1d0348
{
Packit Service 1d0348
	static const char _key[] = "\r\nWARC-Target-URI:";
Packit Service 1d0348
	const char *val, *uri, *eol, *p;
Packit Service 1d0348
	warc_string_t res = {0U, NULL};
Packit Service 1d0348
Packit Service 1d0348
	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
Packit Service 1d0348
		/* no bother */
Packit Service 1d0348
		return res;
Packit Service 1d0348
	}
Packit Service 1d0348
	/* overread whitespace */
Packit Service 1d0348
	val += sizeof(_key) - 1U;
Packit Service 1d0348
	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
Packit Service 1d0348
		/* no end of line */
Packit Service 1d0348
		return res;
Packit Service 1d0348
	}
Packit Service 1d0348
Packit Service 1d0348
	while (val < eol && (*val == ' ' || *val == '\t'))
Packit Service 1d0348
		++val;
Packit Service 1d0348
Packit Service 1d0348
	/* overread URL designators */
Packit Service 1d0348
	if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
Packit Service 1d0348
		/* not touching that! */
Packit Service 1d0348
		return res;
Packit Service 1d0348
	}
Packit Service 1d0348
Packit Service 1d0348
	/* spaces inside uri are not allowed, CRLF should follow */
Packit Service 1d0348
	for (p = val; p < eol; p++) {
Packit Service 1d0348
		if (isspace((unsigned char)*p))
Packit Service 1d0348
			return res;
Packit Service 1d0348
	}
Packit Service 1d0348
Packit Service 1d0348
	/* there must be at least space for ftp */
Packit Service 1d0348
	if (uri < (val + 3U))
Packit Service 1d0348
		return res;
Packit Service 1d0348
Packit Service 1d0348
	/* move uri to point to after :// */
Packit Service 1d0348
	uri += 3U;
Packit Service 1d0348
Packit Service 1d0348
	/* now then, inspect the URI */
Packit Service 1d0348
	if (memcmp(val, "file", 4U) == 0) {
Packit Service 1d0348
		/* perfect, nothing left to do here */
Packit Service 1d0348
Packit Service 1d0348
	} else if (memcmp(val, "http", 4U) == 0 ||
Packit Service 1d0348
		   memcmp(val, "ftp", 3U) == 0) {
Packit Service 1d0348
		/* overread domain, and the first / */
Packit Service 1d0348
		while (uri < eol && *uri++ != '/');
Packit Service 1d0348
	} else {
Packit Service 1d0348
		/* not sure what to do? best to bugger off */
Packit Service 1d0348
		return res;
Packit Service 1d0348
	}
Packit Service 1d0348
	res.str = uri;
Packit Service 1d0348
	res.len = eol - uri;
Packit Service 1d0348
	return res;
Packit Service 1d0348
}
Packit Service 1d0348
Packit Service 1d0348
/*
 * Read the decimal value of the "Content-Length:" header found inside
 * the BSZ bytes starting at BUF.
 *
 * Returns the parsed length, or -1 when the key or its terminating
 * CRLF is missing, the value is empty, the value contains anything but
 * decimal digits (after optional leading blanks/tabs), or the number
 * does not fit the non-negative half of the size_t range.
 *
 * The digits are accumulated by hand instead of via strtol(): an
 * overlong number is rejected rather than silently clamped, and the
 * usable range does not depend on the width of long.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	/*
	 * Upper bound for the result: half the size_t range, so the
	 * value stays non-negative once returned as ssize_t (assumes
	 * ssize_t is the signed counterpart of size_t).
	 */
	const size_t maxlen = ((size_t)-1) >> 1;
	const char *val, *eol;
	size_t len = 0U;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		/* no bother */
		return -1;
	}
	val += sizeof(_key) - 1U;
	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	while (val < eol && (*val == ' ' || *val == '\t'))
		val++;
	/* there must be at least one digit */
	if (val == eol)
		return -1;

	/* every remaining character up to the CRLF must be a digit */
	for (; val < eol; val++) {
		size_t digit;

		if (!isdigit((unsigned char)*val))
			return -1;
		digit = (size_t)(*val - '0');
		/* refuse values where len * 10 + digit exceeds maxlen */
		if (len > (maxlen - digit) / 10U)
			return -1;
		len = len * 10U + digit;
	}

	return (ssize_t)len;
}
Packit Service 1d0348
Packit Service 1d0348
/*
 * Read the timestamp of the "WARC-Date:" header found inside the BSZ
 * bytes starting at BUF.
 *
 * Returns the value produced by xstrpisotime(), or (time_t)-1 when the
 * key or its terminating CRLF is missing, or when the parser did not
 * stop exactly at the CRLF.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const char *val, *eol;
	char *on = NULL;
	time_t res;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		/* no bother */
		return (time_t)-1;
	}
	val += sizeof(_key) - 1U;
	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	res = xstrpisotime(val, &on);
	if (on != eol) {
		/* line must end here */
		return (time_t)-1;
	}
	return res;
}
Packit Service 1d0348
Packit Service 1d0348
/*
 * Read the timestamp of the "Last-Modified:" header found inside the
 * BSZ bytes starting at BUF.
 *
 * Returns the value produced by xstrpisotime(), or (time_t)-1 when the
 * key or its terminating CRLF is missing, or when the parser did not
 * stop exactly at the CRLF.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const char *val, *eol;
	char *on = NULL;
	time_t res;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		/* no bother */
		return (time_t)-1;
	}
	val += sizeof(_key) - 1U;
	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	res = xstrpisotime(val, &on);
	if (on != eol) {
		/* line must end here */
		return (time_t)-1;
	}
	return res;
}
Packit Service 1d0348
Packit Service 1d0348
/*
 * Find the end of the header block, i.e. the "\r\n\r\n" sequence,
 * within the BSZ bytes starting at BUF.
 *
 * Returns the xmemmem() hit advanced by the four marker bytes, or
 * NULL when xmemmem() reports no hit.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *pos = xmemmem(buf, bsz, _marker, mlen);

	if (pos == NULL)
		return NULL;
	return pos + mlen;
}
Packit Service 1d0348
Packit Service 1d0348
/*
 * Find the end of the current line, i.e. a "\r\n" sequence, within the
 * BSZ bytes starting at BUF.
 *
 * Returns the xmemmem() result unchanged: the position of the CRLF, or
 * NULL when xmemmem() reports no hit.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char crlf[] = "\r\n";
	const char *pos;

	pos = xmemmem(buf, bsz, crlf, sizeof(crlf) - 1U);
	return pos;
}
Packit Service 1d0348
/* archive_read_support_format_warc.c ends here */