[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

c parser backend idea



I've been toying around with an idea about a c backend for sablecc altgen.
It would be quite easy to do, only for what features and attitude to add,
as c is clearly not an OO language.

See attached main.xss and its generated public interface.
What do you think?

Primary goals:
 - small code size
 - small memory footprint
 - fast

Just provide an abstract syntax tree, with user defined properties attached
to the structure (but no parent link).

Leave lots of dirty work to programmers. Several options to finetune
memory-management for different conditions (but not affecting API too
much to make incompatible).

Unicode/UTF8 support.

No namespacing/module support in C. So the only option is to use prefixing.

Regards,
Indrek
$ param dest_dir = '.', target_build

$ param package = @package
$ param basedir = $dest_dir, prefix = {translate($package, '.', '_')} + '_'
$ param libname = {translate($package, '.', '_')}
$ param incdir = $basedir + '/' + $libname

$ // One can override basic memory allocation functions
$ param malloc = 'malloc'
$ param free = 'free'
$ param realloc = 'realloc'

$ param no_long_errors = '' //|YES, don't generate long error message strings

$ // tokens by default are encoded with utf8, can be truncated to latin1
$ param token_encoding = 'utf8' //|latin1

$ // "token cache" for token memory savings in most cases (but slowness)
$ // "fast token alloc" for fast token allocation at parser
$ // (not memory saving, just append data without frees)
$ param token_alloc = 'malloc' // |cached|fast
$ // "pooled node alloc" for fast and pooled node allocations at parser
$ param node_alloc = 'pooled' // |malloc
$ param inline = '' //|YES - replace some functions with static inline

$ param prefix_macro = '', prefix_type = '', prefix_function = '', prefix_nodetype = 'n'

$ output $dest_dir + '/parser.h'
#ifndef __parser_h__
#define __parser_h__

#include <stdio.h>

enum ${$prefix_type}node_type
{
$ foreach  {//token}
  ${$prefix_nodetype}@ename,
$ end foreach
  ${$prefix_nodetype}EOF,
$ foreach {//alt}
  ${$prefix_nodetype}@ename,
$ end foreach
};

#define ${$prefix_macro}FIRST_TOKEN ${$prefix_nodetype}${//token/@ename}
#define ${$prefix_macro}LAST_TOKEN ${$prefix_nodetype}EOF
#define ${$prefix_macro}FIRST_PRODUCTION ${$prefix_nodetype}${//alt/@ename}
#define ${$prefix_macro}LAST_PRODUCTION ${$prefix_nodetype}${//alt[position()=count(//alt)]/@ename}

struct ${$prefix_type}node;

struct ${$prefix_type}node
{
  enum ${$prefix_type}node_type type;
  union {
    struct {
      const char *value;
      int len;
    } token;
    struct ${$prefix_type}node *node_array[0];
$ foreach {//alt}
    struct {
$   foreach {elem}
$     if @is_list
      struct {
        struct ${$prefix_type}node *first;
      } l@ename;
$     else
      struct ${$prefix_type}node *@ename;
$     end if
$   end foreach
    } @ename;
$ end foreach
  } c;
  struct ${$prefix_type}node *next;    /* for a list */
  struct
  {
$user_properties
  } props;
};

#define ${$prefix_macro}NODE_TYPE(n) ((n)->type)
#define ${$prefix_macro}NODE_IS_TOKEN(n) (${$prefix_macro}NODE_TYPE(n) < ${$prefix_macro}FIRST_PRODUCTION)
#define ${$prefix_macro}NODE_IS_PRODUCTION(n) (${$prefix_macro}NODE_TYPE(n) >= ${$prefix_macro}FIRST_PRODUCTION)

struct ${$prefix_type}parser;
struct ${$prefix_type}lexer;

enum ${$prefix_type}input_encoding
{
  LATIN1,
  UTF8,
};

struct ${$prefix_type}lexer * ${$prefix_function}lexer_create_from_path(const char *path, enum ${$prefix_type}input_encoding encoding);
struct ${$prefix_type}lexer * ${$prefix_function}lexer_create_from_file(FILE *in, enum ${$prefix_type}input_encoding encoding);
struct ${$prefix_type}lexer * ${$prefix_function}lexer_create_from_string(const char *str, enum ${$prefix_type}input_encoding encoding);

typedef ${$prefix_type}struct node * (*${$prefix_type}lexer_get_t)(void *, struct ${$prefix_type}parser *);
struct ${$prefix_type}node *${$prefix_function}lexer_get (struct ${$prefix_type}lexer *lexer, struct ${$prefix_type}parser *for_parser);
void ${$prefix_function}lexer_destroy (struct ${$prefix_type}lexer *lexer);

extern struct ${$prefix_type}parser *${$prefix_function}parser_create();
extern struct ${$prefix_type}node *${$prefix_function}parser_parse (lexer_get_t get_fn, void *lexer_data);
extern void ${$prefix_function}parser_destroy (struct ${$prefix_type}parser *parser);

extern struct ${$prefix_type}node *${$prefix_function}node_alloc (struct ${$prefix_type}parser *parser);
extern struct ${$prefix_type}node *${$prefix_function}node_alloc_token (struct ${$prefix_type}parser *parser, enum ${$prefix_type}node_type type, const char *value, int len);
extern struct ${$prefix_type}node *${$prefix_function}node_alloc_dirty (struct ${$prefix_type}parser *parser);
extern struct ${$prefix_type}node *${$prefix_function}node_clone (struct ${$prefix_type}parser *parser, struct ${$prefix_type}node *what);

extern void ${$prefix_function}node_add_list (struct ${$prefix_type}node **list, struct ${$prefix_type}node *node);
extern void ${$prefix_function}node_rem_list (struct ${$prefix_type}node **list, struct ${$prefix_type}node *node);

extern void ${$prefix_function}node_free (struct ${$prefix_type}parser *parser, struct ${$prefix_type}node *node);

/* use this for building tree walkers */
#define ${$prefix_macro}PRODUCTION_ITERATE(n,expression) \
{\
  struct ${$prefix_type}node **i; \
  i = n->c.node_array; \
  while ( i != &n->next ) { \
    struct ${$prefix_type}node *iterator = *i; \
    if ( !iterator ) break; \
    do { \
      { expression; } \
      iterator = iterator->next; \
    } while (iterator); \
    i++; \
  } \
}

/* safer variant of previous function */
#define ${$prefix_macro}NODE_ITERATE(n,expression) \
  if (${$prefix_macro}NODE_IS_PRODUCTION(n)) ${$prefix_macro}PRODUCTION_ITERATE(n,expression)

/* this macro is for debug, can bloat code so if one wants to make
   industrial strength use of it wrap in a function first */
#define ${$prefix_macro}NODE2NAME(node) \
({\
  const char *n; \
  switch (${$prefix_macro}NODE_TYPE(node)) { \
$ foreach  {//token}
    case ${$prefix_nodetype}@ename: n = "@ename"; break; \
$ end foreach
    case ${$prefix_nodetype}EOF: n = "EOF"; break; \
$ foreach {//alt}
    case ${$prefix_nodetype}@ename: n = "@ename"; break; \
$ end foreach
    default: n = "(unknown node type)"; break; \
  } \
  n;\
})

#endif /* !__parser_h__ */
$ end output

#ifndef __parser_h__
#define __parser_h__

#include <stdio.h>

enum node_type
{
  nTLPar,
  nTRPar,
  nTPlus,
  nTMinus,
  nTMult,
  nTDiv,
  nTSemi,
  nTBlank,
  nTNumber,
  nTOne,
  nTTwo,
  nTThree,
  nTRandom,
  nEOF,
  nAGrammar,
  nAPlusExp,
  nAMinusExp,
  nADivExp,
  nAMultExp,
  nATextualExp,
  nARandomX2Exp,
  nANumberExp,
  nAT1Textual,
  nAT2Textual,
  nAT3Textual,
};

#define FIRST_TOKEN nTLPar
#define LAST_TOKEN nEOF
#define FIRST_PRODUCTION nAGrammar
#define LAST_PRODUCTION nAT3Textual

struct node;

struct node
{
  enum node_type type;
  union {
    struct {
      const char *value;
      int len;
    } token;
    struct node *node_array[0];
    struct {
      struct {
        struct node *first;
      } lExp;
    } AGrammar;
    struct {
      struct node *L;
      struct node *R;
    } APlusExp;
    struct {
      struct node *L;
      struct node *R;
    } AMinusExp;
    struct {
      struct node *L;
      struct node *R;
    } ADivExp;
    struct {
      struct node *L;
      struct node *R;
    } AMultExp;
    struct {
      struct {
        struct node *first;
      } lTextual;
    } ATextualExp;
    struct {
      struct node *R1;
      struct node *R2;
    } ARandomX2Exp;
    struct {
      struct node *Number;
    } ANumberExp;
    struct {
      struct node *One;
    } AT1Textual;
    struct {
      struct node *Two;
    } AT2Textual;
    struct {
      struct node *Three;
    } AT3Textual;
  } c;
  struct node *next;    /* for a list */
  struct
  {

  } props;
};

#define NODE_TYPE(n) ((n)->type)
#define NODE_IS_TOKEN(n) (NODE_TYPE(n) < FIRST_PRODUCTION)
#define NODE_IS_PRODUCTION(n) (NODE_TYPE(n) >= FIRST_PRODUCTION)

struct parser;
struct lexer;

enum input_encoding
{
  LATIN1,
  UTF8,
};

struct lexer * lexer_create_from_path(const char *path, enum input_encoding encoding);
struct lexer * lexer_create_from_file(FILE *in, enum input_encoding encoding);
struct lexer * lexer_create_from_string(const char *str, enum input_encoding encoding);

typedef struct node * (*lexer_get_t)(void *, struct parser *);
struct node *lexer_get (struct lexer *lexer, struct parser *for_parser);
void lexer_destroy (struct lexer *lexer);

extern struct parser *parser_create();
extern struct node *parser_parse (lexer_get_t get_fn, void *lexer_data);
extern void parser_destroy (struct parser *parser);

extern struct node *node_alloc (struct parser *parser);
extern struct node *node_alloc_token (struct parser *parser, enum node_type type, const char *value, int len);
extern struct node *node_alloc_dirty (struct parser *parser);
extern struct node *node_clone (struct parser *parser, struct node *what);

extern void node_add_list (struct node **list, struct node *node);
extern void node_rem_list (struct node **list, struct node *node);

extern void node_free (struct parser *parser, struct node *node);

/* use this for building tree walkers */
#define PRODUCTION_ITERATE(n,expression) \
{\
  struct node **i; \
  i = n->c.node_array; \
  while ( i != &n->next ) { \
    struct node *iterator = *i; \
    if ( !iterator ) break; \
    do { \
      { expression; } \
      iterator = iterator->next; \
    } while (iterator); \
    i++; \
  } \
}

/* safer variant of previous function */
#define NODE_ITERATE(n,expression) \
  if (NODE_IS_PRODUCTION(n)) PRODUCTION_ITERATE(n,expression)

/* this macro is for debug, can bloat code so if one wants to make
   industrial strength use of it wrap in a function first */
#define NODE2NAME(node) \
({\
  const char *n; \
  switch (NODE_TYPE(node)) { \
    case nTLPar: n = "TLPar"; break; \
    case nTRPar: n = "TRPar"; break; \
    case nTPlus: n = "TPlus"; break; \
    case nTMinus: n = "TMinus"; break; \
    case nTMult: n = "TMult"; break; \
    case nTDiv: n = "TDiv"; break; \
    case nTSemi: n = "TSemi"; break; \
    case nTBlank: n = "TBlank"; break; \
    case nTNumber: n = "TNumber"; break; \
    case nTOne: n = "TOne"; break; \
    case nTTwo: n = "TTwo"; break; \
    case nTThree: n = "TThree"; break; \
    case nTRandom: n = "TRandom"; break; \
    case nEOF: n = "EOF"; break; \
    case nAGrammar: n = "AGrammar"; break; \
    case nAPlusExp: n = "APlusExp"; break; \
    case nAMinusExp: n = "AMinusExp"; break; \
    case nADivExp: n = "ADivExp"; break; \
    case nAMultExp: n = "AMultExp"; break; \
    case nATextualExp: n = "ATextualExp"; break; \
    case nARandomX2Exp: n = "ARandomX2Exp"; break; \
    case nANumberExp: n = "ANumberExp"; break; \
    case nAT1Textual: n = "AT1Textual"; break; \
    case nAT2Textual: n = "AT2Textual"; break; \
    case nAT3Textual: n = "AT3Textual"; break; \
    default: n = "(unknown node type)"; break; \
  } \
  n;\
})

#endif /* !__parser_h__ */