Separate modules by subdirectory

2012-02-06 18:26:45 -05:00
parent 7e56af8fcd
commit 38c9e99882
58 changed files with 79 additions and 16 deletions
@@ -0,0 +1,127 @@
+#include <assert.h>
+#include <typeinfo>
+#include <langinfo.h>
+
+#include "parser.h"
+
+/* Hand an action to the output list unless it is a no-op.
+   A Parser::Ignore action is deleted here; any other action is
+   pushed onto the back of vec. */
+static void append_or_delete( Parser::Action *act,
+                              std::list<Parser::Action *>&vec )
+{
+  assert( act );
+
+  const bool is_ignore = ( typeid( *act ) == typeid( Parser::Ignore ) );
+
+  if ( is_ignore ) {
+    /* nothing to report for this action, so dispose of it */
+    delete act;
+    return;
+  }
+
+  vec.push_back( act );
+}
+
+/* Feed one wide character to the current state and collect the
+   actions that result.  Without a state change only the transition's
+   own action is reported.  With a state change the order is: exit
+   action of the old state, the transition's action, enter action of
+   the new state; the parser then switches to the new state.
+   Ignore actions are dropped by append_or_delete(). */
+std::list<Parser::Action *> Parser::Parser::input( wchar_t ch )
+{
+  std::list<Action *> actions;
+
+  Transition tx = state->input( ch );
+
+  const bool changing_state = ( tx.next_state != NULL );
+
+  if ( !changing_state ) {
+    /* staying put: just the transition's action */
+    append_or_delete( tx.action, actions );
+    return actions;
+  }
+
+  append_or_delete( state->exit(), actions );
+  append_or_delete( tx.action, actions );
+  append_or_delete( tx.next_state->enter(), actions );
+  state = tx.next_state;
+
+  return actions;
+}
+
+/* Start with no buffered bytes.  The byte buffer has to be at least
+   as large as MB_CUR_MAX, the longest multibyte character of the
+   current locale; that is checked here. */
+Parser::UTF8Parser::UTF8Parser()
+  : parser(),
+    buf_len( 0 )
+{
+  assert( MB_CUR_MAX <= BUF_SIZE );
+}
+
+/* Feed one input byte and return the parser actions it produces.
+
+   Bytes accumulate in buf until mbrtowc() can decode a complete
+   character.  Each decoded character is passed to the wrapped Parser
+   and the actions it returns are appended to the result.  An invalid
+   sequence produces U+FFFD and decoding restarts at the byte that
+   broke the sequence; an incomplete sequence stays buffered for the
+   next call.
+
+   mbrtowc() decodes according to the current locale, so this function
+   will only work in a UTF-8 locale.  NOTE(review): the constructor
+   only checks BUF_SIZE against MB_CUR_MAX, not the locale's codeset,
+   so the caller has to guarantee the locale.
+
+   Throws std::string if mbrtowc() returns a value that none of the
+   branches below recognise. */
+std::list<Parser::Action *> Parser::UTF8Parser::input( char c )
+{
+  assert( buf_len < BUF_SIZE );
+
+  buf[ buf_len++ ] = c;
+
+  wchar_t pwc;
+  mbstate_t ps;
+  memset( &ps, 0, sizeof( ps ) );
+
+  size_t total_bytes_parsed = 0;
+  size_t orig_buf_len = buf_len;
+  std::list<Action *> ret;
+
+  /* this routine is somewhat complicated in order to comply with
+     Unicode 6.0, section 3.9, "Best Practices for using U+FFFD" */
+
+  while ( total_bytes_parsed != orig_buf_len ) {
+    assert( total_bytes_parsed < orig_buf_len );
+    /* never call mbrtowc() with n = 0 */
+    assert( buf_len > 0 );
+    size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps );
+
+    /* mbrtowc() annoyingly returns a size_t, so its error codes are
+       (size_t) -1 and (size_t) -2; they have to be checked before
+       the "> 0" branch */
+
+    if ( bytes_parsed == 0 ) {
+      /* character was NUL, accept and clear buffer */
+      assert( buf_len == 1 );
+      buf_len = 0;
+      pwc = L'\0';
+      bytes_parsed = 1;
+    } else if ( bytes_parsed == (size_t) -1 ) {
+      /* invalid sequence, use replacement character and try again with last char */
+      assert( errno == EILSEQ );
+      if ( buf_len > 1 ) {
+        buf[ 0 ] = buf[ buf_len - 1 ];
+        bytes_parsed = buf_len - 1;
+        buf_len = 1;
+      } else {
+        buf_len = 0;
+        bytes_parsed = 1;
+      }
+      pwc = (wchar_t) 0xFFFD;
+      /* the conversion state is undefined after EILSEQ, so start the
+         retry from the initial state */
+      memset( &ps, 0, sizeof( ps ) );
+    } else if ( bytes_parsed == (size_t) -2 ) {
+      /* can't parse incomplete multibyte character */
+      total_bytes_parsed += buf_len;
+      continue;
+    } else if ( bytes_parsed > 0 ) {
+      /* parsed into pwc, accept */
+      assert( bytes_parsed <= buf_len );
+      /* source and destination are in the same buffer and may
+         overlap, so this has to be memmove(), not memcpy() */
+      memmove( buf, buf + bytes_parsed, buf_len - bytes_parsed );
+      buf_len = buf_len - bytes_parsed;
+    } else {
+      /* defensive: not expected to be reached for an unsigned result */
+      throw std::string( "Unknown return value from mbrtowc" );
+    }
+
+    if ( (pwc < 0) || (pwc > 0x10FFFF) ) { /* outside Unicode range */
+      pwc = (wchar_t) 0xFFFD;
+    }
+
+    std::list<Action *> vec = parser.input( pwc );
+    ret.insert( ret.end(), vec.begin(), vec.end() );
+
+    total_bytes_parsed += bytes_parsed;
+  }
+
+  return ret;
+}
+
+/* Copy constructor: the new parser starts in the same state as
+   `other` by copying its state pointer; the state object itself is
+   not duplicated.
+   NOTE(review): this assumes the pointed-to state outlives both
+   parsers -- confirm against the class definition in parser.h. */
+Parser::Parser::Parser( const Parser &other )
+  : state( other.state )
+{}
+
+/* Copy assignment: take over the state pointer of `other`, so this
+   parser continues from the state `other` is currently in.
+   Returns *this. */
+Parser::Parser & Parser::Parser::operator=( const Parser &other )
+{
+  if ( this != &other ) {
+    state = other.state;
+  }
+
+  return *this;
+}