From 725b9889e0cafbff4288296b969123968a3ec90c Mon Sep 17 00:00:00 2001 From: Keith Winstein Date: Wed, 19 Jan 2011 23:11:49 -0500 Subject: [PATCH] Fix UTF-8 decoding routine to comply with Unicode standard --- parser.cpp | 83 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/parser.cpp b/parser.cpp index 1b51bd0..b4e3811 100644 --- a/parser.cpp +++ b/parser.cpp @@ -52,42 +52,69 @@ std::vector Parser::UTF8Parser::input( char c ) buf[ buf_len++ ] = c; /* This function will only work in a UTF-8 locale. */ - /* This must be asserted by other code. */ + /* This is asserted in the constructor. */ wchar_t pwc; mbstate_t ps; memset( &ps, 0, sizeof( ps ) ); - size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps ); + size_t total_bytes_parsed = 0; + size_t orig_buf_len = buf_len; + std::vector ret; - /* this returns 0 when n = 0! */ + /* this routine is somewhat complicated in order to comply with + Unicode 6.0, section 3.9, "Best Practices for using U+FFFD" */ - /* This function annoying returns a size_t so we have to check - the negative values first before the "> 0" branch */ + while ( total_bytes_parsed != orig_buf_len ) { + assert( total_bytes_parsed < orig_buf_len ); + assert( buf_len > 0 ); + size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps ); - if ( bytes_parsed == 0 ) { - /* character was NUL, accept and clear buffer */ - assert( buf_len == 1 ); - buf_len = 0; - pwc = L'\0'; - } else if ( bytes_parsed == (size_t) -1 ) { - /* invalid sequence, use replacement character and clear buffer */ - assert( errno == EILSEQ ); - buf_len = 0; - pwc = (wchar_t) 0xFFFD; - } else if ( bytes_parsed == (size_t) -2 ) { - /* can't parse complete multibyte character */ - /* return empty vector */ - std::vector vec; - return vec; - } else if ( bytes_parsed > 0 ) { - /* parsed into pwc, accept and clear buffer */ - assert( bytes_parsed == buf_len ); - buf_len = 0; - } else { - throw std::string( "Unknown return value from mbrtowc" ); + /* this returns 0 when n = 0! */ + + /* This function annoying returns a size_t so we have to check + the negative values first before the "> 0" branch */ + + if ( bytes_parsed == 0 ) { + /* character was NUL, accept and clear buffer */ + assert( buf_len == 1 ); + buf_len = 0; + pwc = L'\0'; + bytes_parsed = 1; + } else if ( bytes_parsed == (size_t) -1 ) { + /* invalid sequence, use replacement character and try again with last char */ + assert( errno == EILSEQ ); + if ( buf_len > 1 ) { + buf[ 0 ] = buf[ buf_len - 1 ]; + bytes_parsed = buf_len - 1; + buf_len = 1; + } else { + buf_len = 0; + bytes_parsed = 1; + } + pwc = (wchar_t) 0xFFFD; + } else if ( bytes_parsed == (size_t) -2 ) { + /* can't parse incomplete multibyte character */ + total_bytes_parsed += buf_len; + continue; + } else if ( bytes_parsed > 0 ) { + /* parsed into pwc, accept */ + assert( bytes_parsed <= buf_len ); + memcpy( buf, buf + bytes_parsed, buf_len - bytes_parsed ); + buf_len = buf_len - bytes_parsed; + } else { + throw std::string( "Unknown return value from mbrtowc" ); + } + + std::vector vec = parser.input( pwc ); + for ( std::vector::iterator i = vec.begin(); + i != vec.end(); + i++ ) { + ret.push_back( *i ); + } + + total_bytes_parsed += bytes_parsed; } - /* we parsed character into pwc */ - return parser.input( pwc ); + return ret; }