// Copyright (c) 2018 Dr. Colin Hirsch and Daniel Frey
// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include <cassert>
#include <cctype>
#include <cstdlib>
#if defined( _MSC_VER )
#include <string.h>
#define TAO_PEGTL_STRCASECMP _stricmp
#else
#include <strings.h>
#define TAO_PEGTL_STRCASECMP strcasecmp
#endif
#include <tao/pegtl.hpp>
#include <tao/pegtl/analyze.hpp>
#include <tao/pegtl/contrib/abnf.hpp>
#include <tao/pegtl/contrib/parse_tree.hpp>
namespace tao
{
namespace TAO_PEGTL_NAMESPACE
{
namespace abnf
{
namespace
{
std::string prefix = "tao::pegtl::"; // NOLINT
// clang-format off
std::set< std::string > keywords = { // NOLINT
"alignas", "alignof", "and", "and_eq",
"asm", "auto", "bitand", "bitor",
"bool", "break", "case", "catch",
"char", "char16_t", "char32_t", "class",
"compl", "const", "constexpr", "const_cast",
"continue", "decltype", "default", "delete",
"do", "double", "dynamic_cast", "else",
"enum", "explicit", "export", "extern",
"false", "float", "for", "friend",
"goto", "if", "inline", "int",
"long", "mutable", "namespace", "new",
"noexcept", "not", "not_eq", "nullptr",
"operator", "or", "or_eq", "private",
"protected", "public", "register", "reinterpret_cast",
"return", "short", "signed", "sizeof",
"static", "static_assert", "static_cast", "struct",
"switch", "template", "this", "thread_local",
"throw", "true", "try", "typedef",
"typeid", "typename", "union", "unsigned",
"using", "virtual", "void", "volatile",
"wchar_t", "while", "xor", "xor_eq"
};
// clang-format on
using rules_t = std::vector< std::string >;
rules_t rules_defined; // NOLINT
rules_t rules; // NOLINT
// clang-format off
struct one_tag {};
struct string_tag {};
struct istring_tag {};
// clang-format on
rules_t::reverse_iterator find_rule( rules_t& r, const std::string& v, const rules_t::reverse_iterator& rbegin )
{
return std::find_if( rbegin, r.rend(), [&]( const rules_t::value_type& p ) { return TAO_PEGTL_STRCASECMP( p.c_str(), v.c_str() ) == 0; } );
}
rules_t::reverse_iterator find_rule( rules_t& r, const std::string& v )
{
return find_rule( r, v, r.rbegin() );
}
bool append_char( std::string& s, const char c )
{
if( !s.empty() ) {
s += ", ";
}
s += '\'';
if( c == '\'' || c == '\\' ) {
s += '\\';
}
s += c;
s += '\'';
return std::isalpha( c ) != 0;
}
std::string remove_leading_zeroes( const std::string& v )
{
const auto pos = v.find_first_not_of( '0' );
if( pos == std::string::npos ) {
return "";
}
return v.substr( pos );
}
void shift( internal::iterator& it, int delta )
{
it.data += delta;
it.byte += delta;
it.byte_in_line += delta;
}
} // namespace
namespace grammar
{
// ABNF grammar according to RFC 5234, updated by RFC 7405, with
// the following differences:
//
// To form a C++ identifier from a rulename, all minuses are
// replaced with underscores.
//
// As C++ identifiers are case-sensitive, we remember the "correct"
// spelling from the first occurrence of a rulename, all other
// occurrences are automatically changed to that.
//
// Certain rulenames are reserved as their equivalent C++ identifier is
// reserved as a keyword, an alternative token, by the standard or
// for other, special reasons.
//
// When using numerical values (num-val, repeat), the values
// must be in the range of the corresponsing C++ data type.
//
// Remember we are defining a PEG, not a CFG. Simply copying some
// ABNF from somewhere might lead to surprising results as the
// alternations are now sequential, using the sor<> rule.
//
// PEG also require two extensions: the and-predicate and the
// not-predicate. They are expressed by '&' and '!' respectively,
// being allowed (optionally, only one of them) before the
// repetition. You can use braces for more complex expressions.
//
// Finally, instead of the pre-defined CRLF sequence, we accept
// any type of line ending as a convencience extension:
// clang-format off
struct CRLF : sor< abnf::CRLF, CR, LF > {};
// The rest is according to the RFC(s):
struct comment_cont : until< CRLF, sor< WSP, VCHAR > > {};
struct comment : if_must< one< ';' >, comment_cont > {};
struct c_nl : sor< comment, CRLF > {};
struct c_wsp : sor< WSP, seq< c_nl, WSP > > {};
struct rulename : seq< ALPHA, star< ranges< 'a', 'z', 'A', 'Z', '0', '9', '-' > > > {};
struct quoted_string_cont : until< DQUOTE, print > {};
struct quoted_string : if_must< DQUOTE, quoted_string_cont > {};
struct case_insensitive_string : seq< opt< istring< '%', 'i' > >, quoted_string > {};
struct case_sensitive_string : seq< istring< '%', 's' >, quoted_string > {};
struct char_val : sor< case_insensitive_string, case_sensitive_string > {};
struct prose_val_cont : until< one< '>' >, print > {};
struct prose_val : if_must< one< '<' >, prose_val_cont > {};
template< char First, typename Digit >
struct gen_val
{
struct value : plus< Digit > {};
struct range : if_must< one< '-' >, value > {};
struct next_value : must< value > {};
struct type : seq< istring< First >, must< value >, sor< range, star< one< '.' >, next_value > > > {};
};
using hex_val = gen_val< 'x', HEXDIG >;
using dec_val = gen_val< 'd', DIGIT >;
using bin_val = gen_val< 'b', BIT >;
struct num_val_choice : sor< bin_val::type, dec_val::type, hex_val::type > {};
struct num_val : if_must< one< '%' >, num_val_choice > {};
struct alternation;
struct option_close : one< ']' > {};
struct option : seq< one< '[' >, pad< must< alternation >, c_wsp >, must< option_close > > {};
struct group_close : one< ')' > {};
struct group : seq< one< '(' >, pad< must< alternation >, c_wsp >, must< group_close > > {};
struct element : sor< rulename, group, option, char_val, num_val, prose_val > {};
struct repeat : sor< seq< star< DIGIT >, one< '*' >, star< DIGIT > >, plus< DIGIT > > {};
struct repetition : seq< opt< repeat >, element > {};
struct and_predicate : if_must< one< '&' >, repetition > {};
struct not_predicate : if_must< one< '!' >, repetition > {};
struct predicate : sor< and_predicate, not_predicate, repetition > {};
struct concatenation : list< predicate, plus< c_wsp > > {};
struct alternation : list_must< concatenation, pad< one< '/' >, c_wsp > > {};
struct defined_as_op : sor< string< '=', '/' >, one< '=' > > {};
struct defined_as : pad< defined_as_op, c_wsp > {};
struct rule : seq< if_must< rulename, defined_as, alternation >, star< c_wsp >, must< c_nl > > {};
struct rulelist : until< eof, sor< seq< star< c_wsp >, c_nl >, must< rule > > > {};
// end of grammar
template< typename Rule >
struct error_control : normal< Rule >
{
static const std::string error_message;
template< typename Input, typename... States >
static void raise( const Input& in, States&&... /*unused*/ )
{
throw parse_error( error_message, in );
}
};
template<> const std::string error_control< comment_cont >::error_message = "unterminated comment"; // NOLINT
template<> const std::string error_control< quoted_string_cont >::error_message = "unterminated string (missing '\"')"; // NOLINT
template<> const std::string error_control< prose_val_cont >::error_message = "unterminated prose description (missing '>')"; // NOLINT
template<> const std::string error_control< hex_val::value >::error_message = "expected hexadecimal value"; // NOLINT
template<> const std::string error_control< dec_val::value >::error_message = "expected decimal value"; // NOLINT
template<> const std::string error_control< bin_val::value >::error_message = "expected binary value"; // NOLINT
template<> const std::string error_control< num_val_choice >::error_message = "expected base specifier (one of 'bBdDxX')"; // NOLINT
template<> const std::string error_control< option_close >::error_message = "unterminated option (missing ']')"; // NOLINT
template<> const std::string error_control< group_close >::error_message = "unterminated group (missing ')')"; // NOLINT
template<> const std::string error_control< repetition >::error_message = "expected element"; // NOLINT
template<> const std::string error_control< concatenation >::error_message = "expected element"; // NOLINT
template<> const std::string error_control< alternation >::error_message = "expected element"; // NOLINT
template<> const std::string error_control< defined_as >::error_message = "expected '=' or '=/'"; // NOLINT
template<> const std::string error_control< c_nl >::error_message = "unterminated rule"; // NOLINT
template<> const std::string error_control< rule >::error_message = "expected rule"; // NOLINT
} // namespace grammar
struct node
: tao::TAO_PEGTL_NAMESPACE::parse_tree::basic_node< node >
{
template< typename... States >
void emplace_back( std::unique_ptr< node > child, States&&... st );
};
template< typename Rule > struct selector : std::false_type {};
template<> struct selector< grammar::rulename > : std::true_type {};
template<> struct selector< grammar::quoted_string > : std::true_type
{
static void transform( std::unique_ptr< node >& n )
{
shift( n->m_begin, 1 );
shift( n->m_end, -1 );
const std::string content = n->content();
for( const auto c : content ) {
if( std::isalpha( c ) != 0 ) {
n->id = &typeid( istring_tag );
return;
}
}
if( content.size() == 1 ) {
n->id = &typeid( one_tag );
}
else {
n->id = &typeid( string_tag );
}
}
};
template<> struct selector< grammar::case_sensitive_string > : std::true_type
{
static void transform( std::unique_ptr< node >& n )
{
n = std::move( n->children.back() );
if( n->content().size() == 1 ) {
n->id = &typeid( one_tag );
}
else {
n->id = &typeid( string_tag );
}
}
};
template<> struct selector< grammar::prose_val > : std::true_type {};
template<> struct selector< grammar::hex_val::value > : std::true_type {};
template<> struct selector< grammar::dec_val::value > : std::true_type {};
template<> struct selector< grammar::bin_val::value > : std::true_type {};
template<> struct selector< grammar::hex_val::range > : std::true_type {};
template<> struct selector< grammar::dec_val::range > : std::true_type {};
template<> struct selector< grammar::bin_val::range > : std::true_type {};
template<> struct selector< grammar::hex_val::type > : std::true_type {};
template<> struct selector< grammar::dec_val::type > : std::true_type {};
template<> struct selector< grammar::bin_val::type > : std::true_type {};
template<> struct selector< grammar::alternation > : parse_tree::fold_one {};
template<> struct selector< grammar::option > : std::true_type {};
template<> struct selector< grammar::group > : parse_tree::fold_one {};
template<> struct selector< grammar::repeat > : std::true_type {};
template<> struct selector< grammar::repetition > : parse_tree::fold_one {};
template<> struct selector< grammar::and_predicate > : std::true_type {};
template<> struct selector< grammar::not_predicate > : std::true_type {};
template<> struct selector< grammar::concatenation > : parse_tree::fold_one {};
template<> struct selector< grammar::defined_as_op > : std::true_type {};
template<> struct selector< grammar::rule > : std::true_type {};
// clang-format on
std::string to_string( const std::unique_ptr< node >& n );
std::string to_string( const std::vector< std::unique_ptr< node > >& v );
namespace
{
std::string get_rulename( const std::unique_ptr< node >& n )
{
assert( n->is< grammar::rulename >() );
std::string v = n->content();
std::replace( v.begin(), v.end(), '-', '_' );
return v;
}
std::string get_rulename( const std::unique_ptr< node >& n, const bool print_forward_declarations )
{
std::string v = get_rulename( n );
const auto it = find_rule( rules, v );
if( it != rules.rend() ) {
return *it;
}
if( keywords.count( v ) != 0 || v.find( "__" ) != std::string::npos ) {
throw std::runtime_error( to_string( n->begin() ) + ": '" + v + "' is a reserved rulename" ); // NOLINT
}
if( print_forward_declarations && find_rule( rules_defined, v ) != rules_defined.rend() ) {
std::cout << "struct " << v << ";\n";
}
rules.push_back( v );
return v;
}
template< typename T >
std::string gen_val( const std::unique_ptr< node >& n )
{
if( n->children.size() == 2 ) {
if( n->children.back()->is< T >() ) {
return prefix + "range< " + to_string( n->children.front() ) + ", " + to_string( n->children.back()->children.front() ) + " >";
}
}
if( n->children.size() == 1 ) {
return prefix + "one< " + to_string( n->children ) + " >";
}
return prefix + "string< " + to_string( n->children ) + " >";
}
} // namespace
template< typename... States >
void node::emplace_back( std::unique_ptr< node > child, States&&... st )
{
// inserting a rule is handled here since we need access to all previously inserted rules
if( child->is< grammar::rule >() ) {
const auto rname = get_rulename( child->children.front() );
assert( child->children.at( 1 )->is< grammar::defined_as_op >() );
const auto op = child->children.at( 1 )->content();
// when we insert a normal rule, we need to check for duplicates
if( op == "=" ) {
for( const auto& n : children ) {
if( TAO_PEGTL_STRCASECMP( rname.c_str(), abnf::get_rulename( n->children.front() ).c_str() ) == 0 ) {
throw std::runtime_error( to_string( child->begin() ) + ": rule '" + rname + "' is already defined" ); // NOLINT
}
}
}
// if it is an "incremental alternation", we need to consolidate the assigned alternations
else if( op == "=/" ) {
std::size_t i = 0;
while( i < children.size() ) {
if( TAO_PEGTL_STRCASECMP( rname.c_str(), abnf::get_rulename( children.at( i )->children.front() ).c_str() ) == 0 ) {
auto& previous = children.at( i )->children.back();
// if the previous rule does not assign an alternation, create an intermediate alternation and move its assignee into it.
if( !previous->is< abnf::grammar::alternation >() ) {
std::unique_ptr< node > s( new node );
s->id = &typeid( abnf::grammar::alternation );
s->source = previous->source;
s->m_begin = previous->m_begin;
s->m_end = previous->m_end;
s->children.emplace_back( std::move( previous ) );
previous = std::move( s );
}
// append all new options to the previous rule's assignee (which now always an alternation)
previous->m_end = child->children.back()->m_end;
// if the new rule itself contains an alternation, append the individual entries...
if( child->children.back()->is< abnf::grammar::alternation >() ) {
for( auto& n : child->children.back()->children ) {
previous->children.emplace_back( std::move( n ) );
}
}
// ...otherwise add the node itself as another option.
else {
previous->children.emplace_back( std::move( child->children.back() ) );
}
// finally, move the previous rule to the current position...
child = std::move( children.at( i ) );
// ...and remove the previous rule from the list.
children.erase( children.begin() + i );
// all OK now
break;
}
++i;
}
if( i == children.size() ) {
throw std::runtime_error( to_string( child->begin() ) + ": incremental alternation '" + rname + "' without previous rule definition" ); // NOLINT
}
}
else {
throw std::runtime_error( to_string( child->begin() ) + ": invalid operator '" + op + "', this should not happen!" ); // NOLINT
}
}
// perform the normal emplace_back operation by forwarding to the original method
tao::TAO_PEGTL_NAMESPACE::parse_tree::basic_node< node >::emplace_back( std::move( child ), st... );
}
struct stringifier
{
using function_t = std::string ( * )( const std::unique_ptr< node >& n );
function_t default_ = nullptr;
using map_t = std::map< const std::type_info*, function_t >;
map_t map_;
template< typename T >
void add( const function_t& f )
{
map_.insert( { &typeid( T ), f } );
}
std::string operator()( const std::unique_ptr< node >& n ) const
{
const auto it = map_.find( n->id );
if( it != map_.end() ) {
return it->second( n );
}
return default_( n );
}
};
stringifier make_stringifier()
{
stringifier nrv;
nrv.default_ = []( const std::unique_ptr< node >& n ) -> std::string {
throw std::runtime_error( to_string( n->begin() ) + ": missing to_string() for " + n->name() ); // NOLINT
};
nrv.add< grammar::rulename >( []( const std::unique_ptr< node >& n ) { return get_rulename( n, true ); } );
nrv.add< grammar::rule >( []( const std::unique_ptr< node >& n ) {
return "struct " + get_rulename( n->children.front(), false ) + " : " + to_string( n->children.back() ) + " {};";
} );
nrv.add< string_tag >( []( const std::unique_ptr< node >& n ) {
const std::string content = n->content();
std::string s;
for( const auto c : content ) {
append_char( s, c );
}
return prefix + "string< " + s + " >";
} );
nrv.add< istring_tag >( []( const std::unique_ptr< node >& n ) {
const std::string content = n->content();
std::string s;
for( const auto c : content ) {
append_char( s, c );
}
return prefix + "istring< " + s + " >";
} );
nrv.add< one_tag >( []( const std::unique_ptr< node >& n ) {
const std::string content = n->content();
std::string s;
for( const auto c : content ) {
append_char( s, c );
}
return prefix + "one< " + s + " >";
} );
nrv.add< grammar::hex_val::value >( []( const std::unique_ptr< node >& n ) { return "0x" + n->content(); } );
nrv.add< grammar::dec_val::value >( []( const std::unique_ptr< node >& n ) { return n->content(); } );
nrv.add< grammar::bin_val::value >( []( const std::unique_ptr< node >& n ) {
unsigned long long v = 0;
const char* p = n->m_begin.data;
// TODO: Detect overflow
do {
v <<= 1;
v |= ( *p++ & 1 );
} while( p != n->m_end.data );
std::ostringstream o;
o << v;
return o.str();
} );
nrv.add< grammar::hex_val::type >( []( const std::unique_ptr< node >& n ) { return gen_val< grammar::hex_val::range >( n ); } );
nrv.add< grammar::dec_val::type >( []( const std::unique_ptr< node >& n ) { return gen_val< grammar::dec_val::range >( n ); } );
nrv.add< grammar::bin_val::type >( []( const std::unique_ptr< node >& n ) { return gen_val< grammar::bin_val::range >( n ); } );
nrv.add< grammar::alternation >( []( const std::unique_ptr< node >& n ) { return prefix + "sor< " + to_string( n->children ) + " >"; } );
nrv.add< grammar::option >( []( const std::unique_ptr< node >& n ) { return prefix + "opt< " + to_string( n->children ) + " >"; } );
nrv.add< grammar::group >( []( const std::unique_ptr< node >& n ) { return prefix + "seq< " + to_string( n->children ) + " >"; } );
nrv.add< grammar::prose_val >( []( const std::unique_ptr< node >& n ) { return "/* " + n->content() + " */"; } );
nrv.add< grammar::and_predicate >( []( const std::unique_ptr< node >& n ) {
assert( n->children.size() == 1 );
return prefix + "at< " + to_string( n->children.front() ) + " >";
} );
nrv.add< grammar::not_predicate >( []( const std::unique_ptr< node >& n ) {
assert( n->children.size() == 1 );
return prefix + "not_at< " + to_string( n->children.front() ) + " >";
} );
nrv.add< grammar::concatenation >( []( const std::unique_ptr< node >& n ) {
assert( !n->children.empty() );
return prefix + "seq< " + to_string( n->children ) + " >";
} );
nrv.add< grammar::repetition >( []( const std::unique_ptr< node >& n ) -> std::string {
assert( n->children.size() == 2 );
const auto content = to_string( n->children.back() );
const auto rep = n->children.front()->content();
const auto star = rep.find( '*' );
if( star == std::string::npos ) {
const auto v = remove_leading_zeroes( rep );
if( v.empty() ) {
throw std::runtime_error( to_string( n->begin() ) + ": repetition of zero not allowed" ); // NOLINT
}
return prefix + "rep< " + v + ", " + content + " >";
}
const auto min = remove_leading_zeroes( rep.substr( 0, star ) );
const auto max = remove_leading_zeroes( rep.substr( star + 1 ) );
if( ( star != rep.size() - 1 ) && max.empty() ) {
throw std::runtime_error( to_string( n->begin() ) + ": repetition maximum of zero not allowed" ); // NOLINT
}
if( min.empty() && max.empty() ) {
return prefix + "star< " + content + " >";
}
if( !min.empty() && max.empty() ) {
if( min == "1" ) {
return prefix + "plus< " + content + " >";
}
return prefix + "rep_min< " + min + ", " + content + " >";
}
if( min.empty() && !max.empty() ) {
if( max == "1" ) {
return prefix + "opt< " + content + " >";
}
return prefix + "rep_max< " + max + ", " + content + " >";
}
unsigned long long min_val;
unsigned long long max_val;
{
std::stringstream s;
s.str( min );
s >> min_val;
s.clear();
s.str( max );
s >> max_val;
}
if( min_val > max_val ) {
throw std::runtime_error( to_string( n->begin() ) + ": repetition minimum which is greater than the repetition maximum not allowed" ); // NOLINT
}
const auto min_element = ( min_val == 1 ) ? content : ( prefix + "rep< " + min + ", " + content + " >" );
if( min_val == max_val ) {
return min_element;
}
std::ostringstream os;
os << ( max_val - min_val );
const auto max_element = prefix + ( ( max_val - min_val == 1 ) ? "opt< " : ( "rep_opt< " + os.str() + ", " ) ) + content + " >";
return prefix + "seq< " + min_element + ", " + max_element + " >";
} );
return nrv;
}
std::string to_string( const std::unique_ptr< node >& n )
{
static stringifier s = make_stringifier();
return s( n );
}
std::string to_string( const std::vector< std::unique_ptr< node > >& v )
{
std::string result;
for( const auto& c : v ) {
if( !result.empty() ) {
result += ", ";
}
result += to_string( c );
}
return result;
}
} // namespace abnf
} // namespace TAO_PEGTL_NAMESPACE
} // namespace tao
int main( int argc, char** argv )
{
using namespace tao::TAO_PEGTL_NAMESPACE; // NOLINT
if( argc != 2 ) {
analyze< abnf::grammar::rulelist >();
std::cerr << "Usage: " << argv[ 0 ] << " SOURCE" << std::endl;
return 1;
}
file_input<> in( argv[ 1 ] );
try {
const auto root = parse_tree::parse< abnf::grammar::rulelist, abnf::node, abnf::selector, abnf::grammar::error_control >( in );
for( const auto& rule : root->children ) {
abnf::rules_defined.push_back( abnf::get_rulename( rule->children.front() ) );
}
for( const auto& rule : root->children ) {
std::cout << abnf::to_string( rule ) << std::endl;
}
}
catch( const parse_error& e ) {
const auto p = e.positions.front();
std::cerr << e.what() << std::endl
<< in.line_as_string( p ) << std::endl
<< std::string( p.byte_in_line, ' ' ) << '^' << std::endl;
}
return 0;
}