Fix UTF-8 decoding routine to comply with Unicode standard

This commit is contained in:
Keith Winstein
2011-01-19 23:11:49 -05:00
parent 4065e667b7
commit 725b9889e0
+38 -11
View File
@@ -52,12 +52,22 @@ std::vector<Parser::Action *> Parser::UTF8Parser::input( char c )
buf[ buf_len++ ] = c; buf[ buf_len++ ] = c;
/* This function will only work in a UTF-8 locale. */ /* This function will only work in a UTF-8 locale. */
/* This must be asserted by other code. */ /* This is asserted in the constructor. */
wchar_t pwc; wchar_t pwc;
mbstate_t ps; mbstate_t ps;
memset( &ps, 0, sizeof( ps ) ); memset( &ps, 0, sizeof( ps ) );
size_t total_bytes_parsed = 0;
size_t orig_buf_len = buf_len;
std::vector<Action *> ret;
/* this routine is somewhat complicated in order to comply with
Unicode 6.0, section 3.9, "Best Practices for using U+FFFD" */
while ( total_bytes_parsed != orig_buf_len ) {
assert( total_bytes_parsed < orig_buf_len );
assert( buf_len > 0 );
size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps ); size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps );
/* this returns 0 when n = 0! */ /* this returns 0 when n = 0! */
@@ -70,24 +80,41 @@ std::vector<Parser::Action *> Parser::UTF8Parser::input( char c )
assert( buf_len == 1 ); assert( buf_len == 1 );
buf_len = 0; buf_len = 0;
pwc = L'\0'; pwc = L'\0';
bytes_parsed = 1;
} else if ( bytes_parsed == (size_t) -1 ) { } else if ( bytes_parsed == (size_t) -1 ) {
/* invalid sequence, use replacement character and clear buffer */ /* invalid sequence, use replacement character and try again with last char */
assert( errno == EILSEQ ); assert( errno == EILSEQ );
if ( buf_len > 1 ) {
buf[ 0 ] = buf[ buf_len - 1 ];
bytes_parsed = buf_len - 1;
buf_len = 1;
} else {
buf_len = 0; buf_len = 0;
bytes_parsed = 1;
}
pwc = (wchar_t) 0xFFFD; pwc = (wchar_t) 0xFFFD;
} else if ( bytes_parsed == (size_t) -2 ) { } else if ( bytes_parsed == (size_t) -2 ) {
/* can't parse complete multibyte character */ /* can't parse incomplete multibyte character */
/* return empty vector */ total_bytes_parsed += buf_len;
std::vector<Action *> vec; continue;
return vec;
} else if ( bytes_parsed > 0 ) { } else if ( bytes_parsed > 0 ) {
/* parsed into pwc, accept and clear buffer */ /* parsed into pwc, accept */
assert( bytes_parsed == buf_len ); assert( bytes_parsed <= buf_len );
buf_len = 0; memcpy( buf, buf + bytes_parsed, buf_len - bytes_parsed );
buf_len = buf_len - bytes_parsed;
} else { } else {
throw std::string( "Unknown return value from mbrtowc" ); throw std::string( "Unknown return value from mbrtowc" );
} }
/* we parsed character into pwc */ std::vector<Action *> vec = parser.input( pwc );
return parser.input( pwc ); for ( std::vector<Action *>::iterator i = vec.begin();
i != vec.end();
i++ ) {
ret.push_back( *i );
}
total_bytes_parsed += bytes_parsed;
}
return ret;
} }