Move UTF-8 code inside class

2011-01-14 01:21:58 -05:00
parent 32db675dd2
commit 4065e667b7
4 changed files with 86 additions and 88 deletions
@@ -3,7 +3,7 @@ objects = parserstate.o parser.o templates.o
 executables = parse
 CXX = g++
-CXXFLAGS = -g --std=c++0x -pedantic -Werror -Wall -Wextra -Weffc++ -fno-implicit-templates -pipe -D_FILE_OFFSET_BITS=64 -D_XOPEN_SOURCE=500 -D_GNU_SOURCE
+CXXFLAGS = -g --std=c++0x -pedantic -Werror -Wall -Wextra -Weffc++ -fno-implicit-templates -fno-default-inline -pipe -D_FILE_OFFSET_BITS=64 -D_XOPEN_SOURCE=500 -D_GNU_SOURCE
 LIBS = -lutil
 all: $(executables)
@@ -22,21 +22,9 @@
 const size_t buf_size = 1024;
 class stripstate {
 public:
  /* Descriptor that input is read from, and the one output goes to. */
  int src_fd;
  int dest_fd;

  /* Conversion state carried across mbrtowc calls. */
  mbstate_t ps;

  /* Raw bytes awaiting conversion, and how many of them are held. */
  char buf[ buf_size ];
  size_t buf_len;

  /* Parser that is fed the converted wide characters. */
  Parser::Parser parser;

  /* Both descriptors start at -1, the conversion state and buffer are
     value-initialized, and no bytes are buffered. */
  stripstate()
    : src_fd( -1 ),
      dest_fd( -1 ),
      ps(),
      buf(),
      buf_len( 0 ),
      parser()
  {}
 };
 void emulate_terminal( int fd );
 int copy( int src, int dest );
-int vt_parser( struct stripstate *state );
+int vt_parser( int fd, Parser::UTF8Parser *parser );
 int main( int argc __attribute__((unused)),
 	  char *argv[] __attribute__((unused)),
@@ -87,13 +75,6 @@ int main( int argc __attribute__((unused)),
    my_argv[ 1 ] = NULL;
    /*
    if ( setenv( "TERM", "vt220", true ) < 0 ) {
      perror( "setenv" );
      exit( 1 );
    }
    */
    if ( execve( "/bin/bash", my_argv, envp ) < 0 ) {
      perror( "execve" );
      exit( 1 );
@@ -123,7 +104,7 @@ int main( int argc __attribute__((unused)),
 void emulate_terminal( int fd )
 {
-  struct stripstate output_stripstate;
+  Parser::UTF8Parser parser;
  struct pollfd pollfds[ 2 ];
  pollfds[ 0 ].fd = STDIN_FILENO;
@@ -132,11 +113,6 @@ void emulate_terminal( int fd )
  pollfds[ 1 ].fd = fd;
  pollfds[ 1 ].events = POLLIN;
  output_stripstate.src_fd = fd;
  output_stripstate.dest_fd = STDOUT_FILENO;
  output_stripstate.buf_len = 0;
  memset( &output_stripstate.ps, 0, sizeof( output_stripstate.ps ) );
  while ( 1 ) {
    int active_fds = poll( pollfds, 2, -1 );
    if ( active_fds <= 0 ) {
@@ -149,7 +125,7 @@ void emulate_terminal( int fd )
 	return;
      }
    } else if ( pollfds[ 1 ].revents & POLLIN ) {
-      if ( vt_parser( &output_stripstate ) < 0 ) {
+      if ( vt_parser( fd, &parser ) < 0 ) {
 	return;
      }
    } else if ( (pollfds[ 0 ].revents | pollfds[ 1 ].revents)
@@ -188,11 +164,12 @@ int copy( int src, int dest )
  return 0;
 }
-int vt_parser( struct stripstate *state )
+int vt_parser( int fd, Parser::UTF8Parser *parser )
 {
  char buf[ buf_size ];
  /* fill buffer if possible */
-  ssize_t bytes_read = read( state->src_fd, state->buf + state->buf_len,
+  ssize_t bytes_read = read( fd, buf, buf_size );
 			     buf_size - state->buf_len );
  if ( bytes_read == 0 ) { /* EOF */
    return -1;
  } else if ( bytes_read < 0 ) {
@@ -200,61 +177,9 @@ int vt_parser( struct stripstate *state )
    return -1;
  }
-  state->buf_len += bytes_read;
+  /* feed to parser */
-
+  for ( int i = 0; i < bytes_read; i++ ) {
-  /* translate buffer from UTF-8 to wide characters */
+    std::vector<Parser::Action *> actions = parser->input( buf[ i ] );
  wchar_t out_buffer[ buf_size ];
  size_t in_index = 0, out_index = 0;
  while ( 1 ) {
    assert( in_index <= state->buf_len );
    if ( in_index == state->buf_len ) {
      state->buf_len = 0;
      break;
    }
    wchar_t pwc;
    size_t bytes_parsed = mbrtowc( &pwc, state->buf + in_index,
 				   state->buf_len - in_index,
 				   &state->ps );
    /* this returns 0 when n = 0! */
    /* This function annoyingly returns a size_t, so we have to check
       the negative values first before the "> 0" branch */
    if ( bytes_parsed == 0 ) {
      /* character was NUL */
      in_index++; /* this relies on knowing UTF-8 NUL is one byte! */
      assert( out_index < buf_size );
      out_buffer[ out_index++ ] = L'\0';
    } else if ( bytes_parsed == (size_t) -1 ) {
      /* invalid sequence */
      assert( errno == EILSEQ );
      in_index++;
      assert( out_index < buf_size );
      out_buffer[ out_index++ ] = (wchar_t) 0xFFFD;
      memset( &state->ps, 0, sizeof( state->ps ) );
    } else if ( bytes_parsed == (size_t) -2 ) {
      /* can't parse complete multibyte character */
      memmove( state->buf, state->buf + in_index,
 	       state->buf_len - in_index );
      state->buf_len = state->buf_len - in_index;
      break;
    } else if ( bytes_parsed > 0 ) {
      /* parsed something */
      in_index += bytes_parsed;
      assert( out_index < buf_size );
      out_buffer[ out_index++ ] = pwc;
    } else {
      fprintf( stderr, "Unknown return value %d from mbrtowc\n",
 	       bytes_parsed );
      exit( 1 );
    }
  }
  /* feed to vtparse */
  for ( size_t i = 0; i < out_index; i++ ) {
    std::vector<Parser::Action *> actions = state->parser.input( out_buffer[ i ] );
    for ( std::vector<Parser::Action *>::iterator j = actions.begin();
 	  j != actions.end();
 	  j++ ) {
@@ -263,7 +188,7 @@ int vt_parser( struct stripstate *state )
      assert( act );
      if ( act->char_present ) {
-	printf( "%s(0x%02x) ", act->name().c_str(), act->ch );
+	printf( "%s(0x%02x=%lc) ", act->name().c_str(), act->ch, act->ch );
      } else {
 	printf( "[%s] ", act->name().c_str() );
      }
@@ -1,5 +1,6 @@
 #include <assert.h>
 #include <typeinfo>
 #include <langinfo.h>
 #include "parser.hpp"
@@ -34,3 +35,59 @@ std::vector<Parser::Action *> Parser::Parser::input( wchar_t ch )
  return ret;
 }
 /* Construct an empty UTF-8 front end.

    Throws std::string (after printing the same message to stderr) when
    the current locale's codeset is not UTF-8, because input() only works
    in a UTF-8 locale. */
 Parser::UTF8Parser::UTF8Parser()
  : parser(), buf_len( 0 )
 {
  const char *codeset = nl_langinfo( CODESET );
  if ( strcmp( codeset, "UTF-8" ) == 0 ) {
    /* locale is usable, nothing more to do */
    return;
  }

  fprintf( stderr, "rtm requires a UTF-8 locale.\n" );
  throw std::string( "rtm requires a UTF-8 locale." );
 }
 std::vector<Parser::Action *> Parser::UTF8Parser::input( char c )
 {
  assert( buf_len < BUF_SIZE );
  buf[ buf_len++ ] = c;
  /* This function will only work in a UTF-8 locale. */
  /* This must be asserted by other code. */
  wchar_t pwc;
  mbstate_t ps;
  memset( &ps, 0, sizeof( ps ) );
  size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps );
  /* this returns 0 when n = 0! */
  /* This function annoying returns a size_t so we have to check
     the negative values first before the "> 0" branch */
  if ( bytes_parsed == 0 ) {
    /* character was NUL, accept and clear buffer */
    assert( buf_len == 1 );
    buf_len = 0;
    pwc = L'\0';
  } else if ( bytes_parsed == (size_t) -1 ) {
    /* invalid sequence, use replacement character and clear buffer */
    assert( errno == EILSEQ );
    buf_len = 0;
    pwc = (wchar_t) 0xFFFD;
  } else if ( bytes_parsed == (size_t) -2 ) {
    /* can't parse complete multibyte character */
    /* return empty vector */
    std::vector<Action *> vec;
    return vec;
  } else if ( bytes_parsed > 0 ) {
    /* parsed into pwc, accept and clear buffer */
    assert( bytes_parsed == buf_len );
    buf_len = 0;
  } else {
    throw std::string( "Unknown return value from mbrtowc" );
  }
  /* we parsed character into pwc */
  return parser.input( pwc );
 }
@@ -6,6 +6,7 @@
 #include <wchar.h>
 #include <vector>
 #include <string.h>
 #include "parsertransition.hpp"
 #include "parseraction.hpp"
@@ -25,7 +26,22 @@ namespace Parser {
    bool operator=( const Parser & );
    ~Parser() {}
-    std::vector<Action *> input( wchar_t c );
+    std::vector<Action *> input( wchar_t ch );
  };
  static const size_t BUF_SIZE = 8;
  /* Byte-at-a-time front end to Parser: collects the bytes of one
     multibyte character and forwards each decoded character. */
  class UTF8Parser {
  public:
    UTF8Parser();

    /* Consume one input byte and return the resulting actions. */
    std::vector<Action *> input( char c );

  private:
    Parser parser;          /* receives the decoded characters */
    char buf[ BUF_SIZE ];   /* bytes of the character being assembled */
    size_t buf_len;         /* number of bytes currently held in buf */
  };
 }