|
Sun Mon Tue Wed Thu Fri Sat 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
Leaves
Wednesday, March 25, 2015
Thursday, March 26, 2015
// Pull-style filter specification: consumes UTF-8 bytes and emits one
// UnicodeChar per decoded sequence. A byte below 0x80 is emitted as-is; a
// lead byte announces 1 to 4 following bytes, each of which contributes six
// payload bits. On a malformed byte the filter returns without emitting.
filter UTF8ToUnicodeDecoder(byte, UnicodeChar)
{
    byte v = consume;
    if (v < 0x80) {
        // Single-byte (ASCII) sequence.
        emit UnicodeChar(v);
    }
    else
    {
        int _code_point;
        int _following;
        // The payload mask keeps only the bits below the lead-byte marker,
        // so it gets one bit narrower for every extra following byte.
        if ((v & 0xFC) == 0xF8) {        // 111110xx: 2 payload bits
            _code_point = v & 0x03;
            _following = 4;
        }
        else if ((v & 0xF8) == 0xF0) {   // 11110xxx: 3 payload bits
            _code_point = v & 0x07;
            _following = 3;
        }
        else if ((v & 0xF0) == 0xE0) {   // 1110xxxx: 4 payload bits
            _code_point = v & 0x0F;
            _following = 2;
        }
        else if ((v & 0xE0) == 0xC0) {   // 110xxxxx: 5 payload bits
            _code_point = v & 0x1F;
            _following = 1;
        }
        else {
            return; // error: not a valid start byte
        }
        for (; _following > 0; _following--)
        {
            v = consume;
            if ((v & 0xC0) != 0x80) {
                return; // error: not a valid following byte
            }
            // A following byte is 10xxxxxx: drop the two marker bits and
            // append the remaining six to the accumulated code point.
            _code_point = (_code_point << 6) | (v & 0x3F);
        }
        emit UnicodeChar(_code_point);
    }
};
I have not yet thought about syntax constructs for using the filter. The language should at least have an iterator concept (for the pull algorithm), a stream concept (for the push algorithm), and/or have pipe statements like in most Unix shells. In the above filter specification there are also a number of error conditions that can occur. Because there are many ways of dealing with errors, the error handling might depend on the context in which the filter is being used. Ideally, one would have some mechanism to specify the error handling at the place where the filter is used. I recently implemented the push algorithm in a C++ class in the following manner, while working on a new version of IParse:
/// Push-style UTF-8 decoder.
///
/// Bytes are fed one at a time through emit(). Each completed sequence is
/// forwarded as a single UnicodeChar to `_out` (not declared in this class;
/// presumably supplied by ConverterStream - confirm against the base class).
/// Sequences of one to five bytes are recognised. Malformed input is dropped
/// silently; no error is reported to the caller.
class UTF8ToUnicodeConverterStream : public ConverterStream<char, UnicodeChar>
{
public:
	/// Starts outside any multi-byte sequence with a value-initialised
	/// accumulator.
	UTF8ToUnicodeConverterStream() : _following(0), _code_point() {}

	/// Consumes one UTF-8 byte.
	///
	/// @param symbol next byte of the UTF-8 input.
	///
	/// Emits a code point on `_out` when the byte is ASCII or completes a
	/// multi-byte sequence; otherwise only updates the decoder state.
	virtual void emit(const char symbol)
	{
		// Work on the unsigned value so the bit tests below are not affected
		// by the signedness of plain char.
		const unsigned char value = static_cast<unsigned char>(symbol);

		// Following byte: 10xxxxxx.
		if ((value & 0xC0) == 0x80) {
			if (_following == 0) {
				// error: out of place following char
				return;
			}
			// Append the six payload bits to the accumulated code point.
			_code_point = (_code_point << 6) | (value & 0x3F);
			if (--_following == 0)
				_out->emit(_code_point);
			return;
		}

		if (_following > 0) {
			// error: expecting more following chars
			// Abandon the truncated sequence, but do not return: this byte
			// is not a following byte, so it is decoded below as the start
			// of a new sequence instead of being thrown away with the
			// broken one.
			_following = 0;
		}

		if ((value & 0x80) == 0x00) {
			// 0xxxxxxx: single-byte sequence, forwarded unchanged.
			_out->emit(value);
		}
		else if ((value & 0xE0) == 0xC0) {
			// 110xxxxx: five payload bits, one following byte.
			_code_point = value & 0x1F;
			_following = 1;
		}
		else if ((value & 0xF0) == 0xE0) {
			// 1110xxxx: four payload bits, two following bytes.
			_code_point = value & 0x0F;
			_following = 2;
		}
		else if ((value & 0xF8) == 0xF0) {
			// 11110xxx: three payload bits, three following bytes.
			_code_point = value & 0x07;
			_following = 3;
		}
		else if ((value & 0xFC) == 0xF8) {
			// 111110xx: two payload bits, four following bytes.
			_code_point = value & 0x03;
			_following = 4;
		}
		else {
			// error: incorrect start character
		}
	}

private:
	int _following;          ///< Following bytes still expected; 0 when idle.
	UnicodeChar _code_point; ///< Code point accumulated for the current sequence.
};