Blob Blame History Raw
/*
 * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
 * Copyright (c) 2003, 2004, 2005 Lev Walkin <vlm@lionet.info>.
 * 	All rights reserved.
 * Redistribution and modifications are permitted subject to BSD license.
 */
#include <asn_system.h>
#include <xer_support.h>

/*
 * States of the incremental XML tokenizer.
 *
 * The numeric value of the current state is handed back to the caller
 * through the state context and cast back on the next call, so the
 * enumerators must keep their positions.
 */
typedef enum {
	ST_TEXT                = 0,	/* Plain text, outside of any markup */
	ST_TAG_START           = 1,	/* Seen '<', nature of the markup unknown */
	ST_TAG_BODY            = 2,	/* Inside a tag */
	ST_TAG_QUOTE_WAIT      = 3,	/* Seen '=' inside a tag, value expected */
	ST_TAG_QUOTED_STRING   = 4,	/* Inside a double-quoted attribute value */
	ST_TAG_UNQUOTED_STRING = 5,	/* Inside an unquoted attribute value */
	ST_COMMENT_WAIT_DASH1  = 6,	/* Seen "<!", expecting the first '-' */
	ST_COMMENT_WAIT_DASH2  = 7,	/* Seen "<!-", expecting the second '-' */
	ST_COMMENT             = 8,	/* Inside a comment */
	ST_COMMENT_CLO_DASH2   = 9,	/* Seen '-' in a comment, expecting another */
	ST_COMMENT_CLO_RT      = 10	/* Seen "--" in a comment, expecting '>' */
} pstate_e;

/*
 * Translation table used by TOKEN_CB_FINAL(): the chunk type given to the
 * macro is used as an index, and the entry found there is the type that is
 * actually reported to the callback for a chunk which completes its element.
 *
 * The entries are positional.
 * NOTE(review): this presumes pxml_chunk_type_e is declared in the order
 * TEXT, TAG, COMMENT, TAG_END, COMMENT_END -- confirm against the header.
 *
 * The table is never written to, hence const.
 */
static const pxml_chunk_type_e final_chunk_type[] = {
	PXML_TEXT,
	PXML_TAG_END,
	PXML_COMMENT_END,
	PXML_TAG_END,
	PXML_COMMENT_END,
};


/*
 * Character classification table, indexed by byte value.
 *
 *	0 - anything else
 *	1 - whitespace: TAB (0x09), LF (0x0a), FF (0x0c), CR (0x0d), SPACE (0x20)
 *	2 - decimal digit
 *	3 - ASCII letter
 *
 * Only the first 128 entries are spelled out; the remaining ones
 * (bytes 0x80..0xff) are implicitly zero-initialized.
 * The table is read-only, hence const.
 */
static const int
_charclass[256] = {
	0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,	/* TAB LF FF CR      */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* SPACE             */
	2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0,	/* 01234567 89       */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  ABCDEFG HIJKLMNO */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0,	/* PQRSTUVW XYZ      */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  abcdefg hijklmno */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0	/* pqrstuvw xyz      */
};
/* The argument is narrowed to unsigned char so that it always indexes 0..255 */
#define WHITESPACE(c)	(_charclass[(unsigned char)(c)] == 1)
#define ALNUM(c)	(_charclass[(unsigned char)(c)] >= 2)
#define ALPHA(c)	(_charclass[(unsigned char)(c)] == 3)

/*
 * Symbolic names for the markup characters the tokenizer reacts to.
 * The values are given as ASCII/UTF-8 code points rather than as
 * character literals.
 */
enum {
	EXCLAM = 0x21,	/* '!' */
	CQUOTE = 0x22,	/* '"' */
	CDASH  = 0x2d,	/* '-' */
	CSLASH = 0x2f,	/* '/' */
	LANGLE = 0x3c,	/* '<' */
	CEQUAL = 0x3d,	/* '=' */
	RANGLE = 0x3e,	/* '>' */
	CQUEST = 0x3f	/* '?' */
};

/*
 * Report the pending chunk to the token callback.
 *
 * These macros are only usable inside a function which provides the
 * variables `p', `chunk_start', `state', `cb' and `key', as well as
 * a `finish' label.
 *
 * The chunk spans from chunk_start up to p, plus the byte at p itself
 * when _current_too is 1. An empty chunk is not reported: only the state
 * is switched to _ns.
 *
 * If the callback returns less than the chunk length, control jumps to
 * `finish' and chunk_start is left alone; the state is switched to _ns
 * only when the return value is -1 and the chunk included the current byte.
 * Otherwise the chunk is considered consumed: chunk_start moves past it
 * and the state becomes _ns.
 *
 * The _final argument is accepted but not used by the expansion.
 */
#define	TOKEN_CB_CALL(type, _ns, _current_too, _final) do {		\
		pstate_e next_state_ = _ns;				\
		ssize_t chunk_len_ = (p - chunk_start) + _current_too;	\
		if(chunk_len_ != 0) {					\
			int cb_ret_ = cb(type, chunk_start,		\
					chunk_len_, key);		\
			if(cb_ret_ < chunk_len_) {			\
				if(_current_too && cb_ret_ == -1)	\
					state = next_state_;		\
				goto finish;				\
			}						\
			chunk_start = p + _current_too;			\
		}							\
		state = next_state_;					\
	} while(0)

/* Report a chunk with exactly the given type */
#define TOKEN_CB(_type, _ns, _current_too)			\
	TOKEN_CB_CALL(_type, _ns, _current_too, 0)

/* Report a chunk with the type final_chunk_type[] assigns to _type */
#define TOKEN_CB_FINAL(_type, _ns, _current_too)		\
	TOKEN_CB_CALL(final_chunk_type[_type], _ns, _current_too, 1)

/*
 * Incremental XML tokenizer.
 *
 * Walks through the SIZE bytes at XMLBUF and splits them into text, tag
 * and comment chunks. Every chunk is reported to CB (through the TOKEN_CB
 * macros) as (chunk type, chunk start, chunk length, KEY).
 *
 * stateContext
 *	In/out. Holds the tokenizer state (a pstate_e value stored as int)
 *	between calls; the value 0 (ST_TEXT) begins a fresh parse. It is
 *	always rewritten before returning.
 * xmlbuf, size
 *	The data to tokenize.
 * cb, key
 *	The token callback and the opaque pointer passed through to it.
 *	A callback return value smaller than the reported chunk length
 *	stops the tokenizer right away (see TOKEN_CB_CALL).
 *
 * Returns the offset within XMLBUF of the first byte which was not
 * accepted by the callback as part of a chunk. Bytes of an unfinished tag
 * at the end of the buffer are not reported and therefore not counted;
 * unfinished text or comment data is flushed as a PXML_TEXT or
 * PXML_COMMENT chunk. The return value is never negative.
 */
ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
	pstate_e state = (pstate_e)*stateContext;
	const char *chunk_start = (const char *)xmlbuf;
	const char *p = chunk_start;
	const char *end = p + size;

	for(; p < end; p++) {
	  int C = *(const unsigned char *)p;

	  /*
	   * Comment opener detection: "<!" has been seen and up to two
	   * dashes are expected. Any other byte means this is some ordinary
	   * tag; in that case the byte is NOT skipped, but examined by the
	   * ST_TAG_BODY logic below, so that '>', '<' or '=' immediately
	   * following "<!" or "<!-" are honored.
	   */
	  if(state == ST_COMMENT_WAIT_DASH1
	  || state == ST_COMMENT_WAIT_DASH2) {
		if(C == CDASH) {
			/* Seen "<!-" so far, or the complete "<!--" */
			state = (state == ST_COMMENT_WAIT_DASH1)
				? ST_COMMENT_WAIT_DASH2 : ST_COMMENT;
			continue;
		}
		state = ST_TAG_BODY;
	  }

	  switch(state) {
	  case ST_TEXT:
		/*
		 * Initial state: we're in the middle of some text,
		 * or just have started.
		 */
		if(C == LANGLE) {
			/* We're now in the tag, probably */
			TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
		}
		break;
	  case ST_TAG_START:
		if(ALPHA(C) || (C == CSLASH)) {
			state = ST_TAG_BODY;
		} else if(C == EXCLAM) {
			state = ST_COMMENT_WAIT_DASH1;
		} else {
			/*
			 * Neither a letter, nor '/', nor '!'.
			 * Must be something like "3 < 4".
			 */
			TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
		}
		break;
	  case ST_TAG_BODY:
		switch(C) {
		case RANGLE:
			/* End of the tag */
			TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
			break;
		case LANGLE:
			/*
			 * The previous tag wasn't completed, but still
			 * recognized as valid. (Mozilla-compatible)
			 */
			TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
			break;
		case CEQUAL:
			state = ST_TAG_QUOTE_WAIT;
			break;
		default:
			/* Any other byte is just a part of the tag */
			break;
		}
		break;
	  case ST_TAG_QUOTE_WAIT:
		/*
		 * State after the equal sign ("=") in the tag.
		 */
		switch(C) {
		case CQUOTE:
			state = ST_TAG_QUOTED_STRING;
			break;
		case RANGLE:
			/* End of the tag */
			TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
			break;
		default:
			if(!WHITESPACE(C)) {
				/* Unquoted string value */
				state = ST_TAG_UNQUOTED_STRING;
			}
			break;
		}
		break;
	  case ST_TAG_QUOTED_STRING:
		/*
		 * Tag attribute's string value in quotes.
		 */
		if(C == CQUOTE) {
			/* Return back to the tag state */
			state = ST_TAG_BODY;
		}
		break;
	  case ST_TAG_UNQUOTED_STRING:
		if(C == RANGLE) {
			/* End of the tag */
			TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
		} else if(WHITESPACE(C)) {
			/* Return back to the tag state */
			state = ST_TAG_BODY;
		}
		break;
	  case ST_COMMENT:
		if(C == CDASH) {
			/* Possibly the first dash of "-->" */
			state = ST_COMMENT_CLO_DASH2;
		}
		break;
	  case ST_COMMENT_CLO_DASH2:
		if(C == CDASH) {
			state = ST_COMMENT_CLO_RT;
		} else {
			/* This is not an end of a comment */
			state = ST_COMMENT;
		}
		break;
	  case ST_COMMENT_CLO_RT:
		if(C == RANGLE) {
			TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
		} else if(C == CDASH) {
			/* Maintain current state, still waiting for '>' */
		} else {
			state = ST_COMMENT;
		}
		break;
	  default:
		/*
		 * The comment opener states were dealt with before
		 * the switch; a state value outside of pstate_e makes
		 * the byte pass by without any action.
		 */
		break;
	  } /* switch(state) */
	} /* for() */

	/*
	 * Flush the partially processed chunk, state permitting.
	 * Only plain text and comment contents may be reported piecewise;
	 * an unfinished tag stays unreported.
	 */
	if(p != chunk_start) {
		switch(state) {
		case ST_COMMENT:
			TOKEN_CB(PXML_COMMENT, state, 0);
			break;
		case ST_TEXT:
			TOKEN_CB(PXML_TEXT, state, 0);
			break;
		default: break;	/* a no-op */
		}
	}

finish:
	*stateContext = (int)state;
	return chunk_start - (const char *)xmlbuf;
}